From 6ae066c91237df40d74f941f14fc56f3a2bbcd6a Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Sun, 28 Dec 2025 16:09:34 +0100 Subject: [PATCH] refactor(ReforceXY): normalize tunables namespace MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Jérôme Benoit --- ReforceXY/reward_space_analysis/README.md | 22 +++++++++---------- .../reward_space_analysis.py | 14 +++++++----- .../components/test_reward_components.py | 10 ++++----- .../tests/helpers/test_internal_branches.py | 2 +- .../tests/pbrs/test_pbrs.py | 12 +++++++--- ReforceXY/user_data/freqaimodels/ReforceXY.py | 9 ++++---- 6 files changed, 39 insertions(+), 30 deletions(-) diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md index 95c53ee..65450f4 100644 --- a/ReforceXY/reward_space_analysis/README.md +++ b/ReforceXY/reward_space_analysis/README.md @@ -52,7 +52,7 @@ Full test documentation: [tests/README.md](./tests/README.md). - [Reward & Shaping](#reward--shaping) - [Diagnostics & Validation](#diagnostics--validation) - [Overrides](#overrides) - - [Reward Parameter Cheat Sheet](#reward-parameter-cheat-sheet) + - [Reward Tunables Reference](#reward-tunables-reference) - [Exit Attenuation Kernels](#exit-attenuation-kernels) - [Transform Functions](#transform-functions) - [Skipping Feature Analysis](#skipping-feature-analysis) @@ -220,7 +220,7 @@ be overridden via `--params`. scalars (`profit_aim`, `risk_reward_ratio`, `action_masking`). Conflicts: individual flags vs `--params` ⇒ `--params` wins. -### Reward Parameter Cheat Sheet +### Reward Tunables Reference #### Core @@ -237,12 +237,12 @@ The exit factor is computed as: ##### PnL Target -| Parameter | Default | Description | -| ------------------- | ------- | ----------------------------- | -| `profit_aim` | 0.03 | Profit target threshold | -| `risk_reward_ratio` | 2.0 | Risk/reward multiplier | -| `win_reward_factor` | 2.0 | Profit target bonus factor | -| `pnl_factor_beta` | 0.5 | PnL amplification sensitivity | +| Parameter | Default | Description | +| ------------------------------- | ------- | ----------------------------- | +| `profit_aim` | 0.03 | Profit target threshold | +| `risk_reward_ratio` | 2.0 | Risk/reward multiplier | +| `win_reward_factor` | 2.0 | Profit target bonus factor | +| `pnl_amplification_sensitivity` | 0.5 | PnL amplification sensitivity | **Note:** In ReforceXY, `risk_reward_ratio` maps to `rr`. @@ -252,9 +252,9 @@ Let `pnl_target = profit_aim × risk_reward_ratio`, `pnl_ratio = pnl / pnl_targe - If `pnl_target ≤ 0`: `pnl_target_coefficient = 1.0` - If `pnl_ratio > 1.0`: - `pnl_target_coefficient = 1.0 + win_reward_factor × tanh(pnl_factor_beta × (pnl_ratio − 1.0))` + `pnl_target_coefficient = 1.0 + win_reward_factor × tanh(pnl_amplification_sensitivity × (pnl_ratio − 1.0))` - If `pnl_ratio < −(1.0 / risk_reward_ratio)`: - `pnl_target_coefficient = 1.0 + (win_reward_factor × risk_reward_ratio) × tanh(pnl_factor_beta × (|pnl_ratio| − 1.0))` + `pnl_target_coefficient = 1.0 + (win_reward_factor × risk_reward_ratio) × tanh(pnl_amplification_sensitivity × (|pnl_ratio| − 1.0))` - Else: `pnl_target_coefficient = 1.0` ##### Efficiency @@ -465,7 +465,7 @@ uv run python reward_space_analysis.py --params win_reward_factor=3.0 idle_penal `risk_reward_ratio`, `action_masking`. 
**Reward tunables** (tunable via either direct flag or `--params`) correspond to -those listed under Reward Parameter Cheat Sheet: Core, Duration Penalties, Exit +those listed under Reward Tunables Reference: Core, Duration Penalties, Exit Attenuation, Efficiency, Validation, PBRS, Hold/Entry/Exit Potential Transforms. ## Examples diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py index cbac069..1db151c 100644 --- a/ReforceXY/reward_space_analysis/reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py @@ -151,7 +151,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = { "efficiency_center": 0.5, # Profit factor defaults "win_reward_factor": 2.0, - "pnl_factor_beta": 0.5, + "pnl_amplification_sensitivity": 0.5, # Invariant / safety defaults "check_invariants": True, "exit_factor_threshold": 1000.0, @@ -202,7 +202,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: dict[str, str] = { "efficiency_weight": "Efficiency weight", "efficiency_center": "Efficiency pivot in [0,1]", "win_reward_factor": "Profit overshoot bonus factor", - "pnl_factor_beta": "PnL amplification sensitivity", + "pnl_amplification_sensitivity": "PnL amplification sensitivity", "check_invariants": "Enable runtime invariant checks", "exit_factor_threshold": "Warn if |exit_factor| exceeds", # PBRS parameters @@ -250,7 +250,7 @@ _PARAMETER_BOUNDS: dict[str, dict[str, float]] = { "efficiency_weight": {"min": 0.0, "max": 2.0}, "efficiency_center": {"min": 0.0, "max": 1.0}, "win_reward_factor": {"min": 0.0}, - "pnl_factor_beta": {"min": 1e-6}, + "pnl_amplification_sensitivity": {"min": 1e-6}, # PBRS parameter bounds "potential_gamma": {"min": 0.0, "max": 1.0}, "exit_potential_decay": {"min": 0.0, "max": 1.0}, @@ -992,12 +992,14 @@ def _compute_pnl_target_coefficient( if pnl_target > 0.0: win_reward_factor = _get_float_param(params, "win_reward_factor") - pnl_factor_beta = _get_float_param(params, "pnl_factor_beta") + pnl_amplification_sensitivity = _get_float_param(params, "pnl_amplification_sensitivity") rr = risk_reward_ratio if risk_reward_ratio > 0 else RISK_REWARD_RATIO_DEFAULT pnl_ratio = pnl / pnl_target if abs(pnl_ratio) > 1.0: - base_pnl_target_coefficient = math.tanh(pnl_factor_beta * (abs(pnl_ratio) - 1.0)) + base_pnl_target_coefficient = math.tanh( + pnl_amplification_sensitivity * (abs(pnl_ratio) - 1.0) + ) if pnl_ratio > 1.0: pnl_target_coefficient = 1.0 + win_reward_factor * base_pnl_target_coefficient elif pnl_ratio < -(1.0 / rr): @@ -1285,7 +1287,7 @@ def calculate_reward( center_unrealized = 0.5 * ( context.max_unrealized_profit + context.min_unrealized_profit ) - beta = _get_float_param(params, "pnl_factor_beta") + beta = _get_float_param(params, "pnl_amplification_sensitivity") next_pnl = float(center_unrealized * math.tanh(beta * next_duration_ratio)) else: next_pnl = current_pnl diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py index f5410d6..da99aa1 100644 --- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py +++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py @@ -220,13 +220,13 @@ class TestRewardComponents(RewardSpaceTestBase): **Setup:** - PnL: 150% of pnl_target (exceeds target by 50%) - pnl_target: 0.045 (profit_aim=0.03 * risk_reward_ratio=1.5) - - Parameters: win_reward_factor=2.0, pnl_factor_beta=0.5 + - Parameters: 
win_reward_factor=2.0, pnl_amplification_sensitivity=0.5 **Assertions:** - Coefficient is finite - Coefficient > 1.0 (rewards exceeding target) """ - params = self.base_params(win_reward_factor=2.0, pnl_factor_beta=0.5) + params = self.base_params(win_reward_factor=2.0, pnl_amplification_sensitivity=0.5) profit_aim = 0.03 risk_reward_ratio = 1.5 pnl_target = profit_aim * risk_reward_ratio @@ -252,13 +252,13 @@ class TestRewardComponents(RewardSpaceTestBase): - PnL: -0.06 (exceeds pnl_target magnitude) - pnl_target: 0.045 (profit_aim=0.03 * risk_reward_ratio=1.5) - Penalty threshold: pnl < -pnl_target = -0.045 - - Parameters: win_reward_factor=2.0, pnl_factor_beta=0.5 + - Parameters: win_reward_factor=2.0, pnl_amplification_sensitivity=0.5 **Assertions:** - Coefficient is finite - Coefficient > 1.0 (amplifies loss penalty) """ - params = self.base_params(win_reward_factor=2.0, pnl_factor_beta=0.5) + params = self.base_params(win_reward_factor=2.0, pnl_amplification_sensitivity=0.5) profit_aim = 0.03 risk_reward_ratio = 1.5 pnl_target = profit_aim * risk_reward_ratio # 0.045 @@ -652,7 +652,7 @@ class TestRewardComponents(RewardSpaceTestBase): pnl_target = profit_aim * risk_reward_ratio params = self.base_params( win_reward_factor=win_reward_factor, - pnl_factor_beta=beta, + pnl_amplification_sensitivity=beta, efficiency_weight=0.0, exit_attenuation_mode="linear", exit_plateau=False, diff --git a/ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py b/ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py index 61419c8..ce62d88 100644 --- a/ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py +++ b/ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py @@ -68,7 +68,7 @@ def test_calculate_reward_unrealized_pnl_hold_path(): params = { "hold_potential_enabled": True, "unrealized_pnl": True, - "pnl_factor_beta": 0.5, + "pnl_amplification_sensitivity": 0.5, } breakdown = calculate_reward_with_defaults( context, diff --git a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py index cbbc3e5..fb56d63 100644 --- a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py +++ b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py @@ -132,7 +132,9 @@ class TestPBRS(RewardSpaceTestBase): ) gamma = _get_float_param( - params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95) + params, + "potential_gamma", + DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95), ) expected_next_potential = ( prev_potential / gamma if gamma not in (0.0, None) else prev_potential @@ -872,7 +874,9 @@ class TestPBRS(RewardSpaceTestBase): potential_gamma=0.9, ) gamma = _get_float_param( - params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95) + params, + "potential_gamma", + DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95), ) rng = np.random.default_rng(555) potentials = rng.uniform(0.05, 0.85, size=220) @@ -1126,7 +1130,9 @@ class TestPBRS(RewardSpaceTestBase): exit_potential_mode="canonical", ) gamma = _get_float_param( - params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95) + params, + "potential_gamma", + DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95), ) rng = np.random.default_rng(321) prev_potential = 0.0 diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py index 6c8fa15..82459f9 100644 --- 
a/ReforceXY/user_data/freqaimodels/ReforceXY.py +++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py @@ -172,7 +172,7 @@ class ReforceXY(BaseReinforcementLearningModel): DEFAULT_EXIT_LINEAR_SLOPE: Final[float] = 1.0 DEFAULT_EXIT_HALF_LIFE: Final[float] = 0.5 - DEFAULT_PNL_FACTOR_BETA: Final[float] = 0.5 + DEFAULT_PNL_AMPLIFICATION_SENSITIVITY: Final[float] = 0.5 DEFAULT_WIN_REWARD_FACTOR: Final[float] = 2.0 DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0 DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5 @@ -2895,16 +2895,17 @@ class MyRLEnv(Base5ActionRLEnv): pnl_target_coefficient = 1.0 if pnl_target > 0.0: - pnl_factor_beta = float( + pnl_amplification_sensitivity = float( model_reward_parameters.get( - "pnl_factor_beta", ReforceXY.DEFAULT_PNL_FACTOR_BETA + "pnl_amplification_sensitivity", + ReforceXY.DEFAULT_PNL_AMPLIFICATION_SENSITIVITY, ) ) pnl_ratio = pnl / pnl_target if abs(pnl_ratio) > 1.0: base_pnl_target_coefficient = math.tanh( - pnl_factor_beta * (abs(pnl_ratio) - 1.0) + pnl_amplification_sensitivity * (abs(pnl_ratio) - 1.0) ) win_reward_factor = float( model_reward_parameters.get( -- 2.43.0
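
A minimal standalone sketch of the piecewise amplification governed by the renamed tunable, mirroring the README formula and the `_compute_pnl_target_coefficient` branches above. The helper name and its keyword defaults are illustrative assumptions (the defaults mirror `DEFAULT_MODEL_REWARD_PARAMETERS`), not part of the patch:

```python
import math


def pnl_target_coefficient(
    pnl: float,
    profit_aim: float = 0.03,
    risk_reward_ratio: float = 2.0,
    win_reward_factor: float = 2.0,
    pnl_amplification_sensitivity: float = 0.5,
) -> float:
    """Sketch of the PnL target amplification with the renamed tunable."""
    pnl_target = profit_aim * risk_reward_ratio
    if pnl_target <= 0.0:
        return 1.0
    pnl_ratio = pnl / pnl_target
    if abs(pnl_ratio) > 1.0:
        # tanh bounds the overshoot, so the coefficient saturates instead of
        # growing without limit as |pnl_ratio| increases.
        base = math.tanh(pnl_amplification_sensitivity * (abs(pnl_ratio) - 1.0))
        if pnl_ratio > 1.0:
            return 1.0 + win_reward_factor * base
        if pnl_ratio < -(1.0 / risk_reward_ratio):
            # Losses beyond the mirrored threshold are amplified more
            # strongly, scaled by risk_reward_ratio.
            return 1.0 + (win_reward_factor * risk_reward_ratio) * base
    return 1.0


# With the defaults above: overshooting pnl_target (0.06) by 50% gives
# pnl_target_coefficient(0.09) ≈ 1.49, while the mirrored loss gives
# pnl_target_coefficient(-0.09) ≈ 1.98.
```

Raising `pnl_amplification_sensitivity` steepens the tanh ramp, so the bonus (or loss penalty) approaches its `win_reward_factor`-scaled ceiling for smaller overshoots of the target.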