From b2985e3f3c1beece6ede3f25ba03ef071c9645df Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?=
Date: Mon, 6 Oct 2025 20:04:13 +0200
Subject: [PATCH] perf(reforcexy): fine-tune default reward settings
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jérôme Benoit
---
 ReforceXY/reward_space_analysis/README.md     |  2 +-
 .../reward_space_analysis.py                  | 12 +++---
 .../test_reward_space_analysis.py             | 36 +++++++++++++++++--
 ReforceXY/user_data/freqaimodels/ReforceXY.py |  8 ++---
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md
index 32bbd13..ce5a3be 100644
--- a/ReforceXY/reward_space_analysis/README.md
+++ b/ReforceXY/reward_space_analysis/README.md
@@ -149,7 +149,7 @@ None - all parameters have sensible defaults.
 
 - Maximum trade duration in candles (from environment config)
 - Should match your actual trading environment setting
-- Also used as fallback for `max_idle_duration_candles` when that tunable is ≤ 0 (idle penalty grace behaviour)
+- Drives the idle grace window: when `max_idle_duration_candles` is ≤ 0, the fallback is `2 * max_trade_duration`
 
 ### Reward Configuration
 
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index 01a9834..e671f53 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -130,7 +130,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: Dict[str, float | str] = {
     # Idle penalty (env defaults)
     "idle_penalty_power": 1.0,
     "idle_penalty_scale": 0.75,
-    # If <=0 or unset, falls back to max_trade_duration_candles at runtime
+    # Fallback semantics: if <=0 or unset → 2 * max_trade_duration_candles (grace window before full idle penalty)
     "max_idle_duration_candles": 0,
     # Holding keys (env defaults)
     "holding_penalty_scale": 0.5,
@@ -158,7 +158,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
     "base_factor": "Base reward factor used inside the environment.",
     "idle_penalty_power": "Power applied to idle penalty scaling.",
     "idle_penalty_scale": "Scale of idle penalty.",
-    "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling; 0 = use max_trade_duration_candles.",
+    "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling; 0 = use 2 * max_trade_duration_candles.",
     "holding_penalty_scale": "Scale of holding penalty.",
     "holding_penalty_power": "Power applied to holding penalty scaling.",
     "exit_factor_mode": "Time attenuation mode for exit factor.",
@@ -489,18 +489,18 @@ def _idle_penalty(
     """Mirror the environment's idle penalty behaviour."""
     idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 0.75)
     idle_penalty_power = _get_param_float(params, "idle_penalty_power", 1.0)
-    max_trade_duration = int(params.get("max_trade_duration_candles", 128))
+    max_trade_duration_candles = params.get("max_trade_duration_candles", 128)
     max_idle_duration_candles = params.get("max_idle_duration_candles")
     try:
         max_idle_duration = (
             int(max_idle_duration_candles)
             if max_idle_duration_candles is not None
-            else max_trade_duration
+            else 2 * max_trade_duration_candles
         )
     except (TypeError, ValueError):
-        max_idle_duration = max_trade_duration
+        max_idle_duration = max_trade_duration_candles
     if max_idle_duration <= 0:
-        max_idle_duration = max_trade_duration
+        max_idle_duration = 2 * max_trade_duration_candles
     idle_duration_ratio = context.idle_duration / max(1, max_idle_duration)
     return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
 
diff --git a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
index 0fddd4c..86a98d2 100644
--- a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
@@ -461,7 +461,7 @@ class TestRewardAlignment(RewardSpaceTestBase):
         )
 
     def test_max_idle_duration_candles_logic(self):
-        """Idle penalty scaling test with explicit max_idle_duration_candles (no force-action comparisons)."""
+        """Idle penalty scaling test with explicit max_idle_duration_candles."""
         params_small = self.DEFAULT_PARAMS.copy()
         params_large = self.DEFAULT_PARAMS.copy()
         # Activate explicit max idle durations
@@ -511,9 +511,13 @@ class TestRewardAlignment(RewardSpaceTestBase):
         )
 
     def test_idle_penalty_fallback_and_proportionality(self):
-        """When max_idle_duration_candles <= 0, fallback to max_trade_duration and ensure proportional scaling.
+        """Fallback & proportionality validation.
 
-        Also validates that penalty doubles (approximately) when idle_duration doubles (holding other params constant).
+        Semantics:
+        - When max_idle_duration_candles <= 0, the fallback must be 2 * max_trade_duration (updated rule).
+        - Idle penalty scales ~ linearly with idle_duration (power=1), so doubling idle_duration doubles the penalty magnitude.
+        - We also infer the implicit denominator from a mid-range idle duration (>1x and <2x trade duration) to confirm the
+          2x fallback.
         """
         params = self.DEFAULT_PARAMS.copy()
         params["max_idle_duration_candles"] = 0  # force fallback
@@ -569,6 +573,32 @@ class TestRewardAlignment(RewardSpaceTestBase):
             tolerance=0.2,
             msg=f"Idle penalty proportionality mismatch (ratio={ratio})",
         )
+        # Additional mid-range inference check (idle_duration between 1x and 2x trade duration)
+        ctx_mid = dataclasses.replace(ctx_a, idle_duration=120, max_trade_duration=100)
+        br_mid = calculate_reward(
+            ctx_mid,
+            params,
+            base_factor=base_factor,
+            profit_target=profit_target,
+            risk_reward_ratio=risk_reward_ratio,
+            short_allowed=True,
+            action_masking=True,
+        )
+        self.assertLess(br_mid.idle_penalty, 0.0)
+        idle_penalty_scale = float(params.get("idle_penalty_scale", 0.75))
+        idle_penalty_power = float(params.get("idle_penalty_power", 1.0))
+        # Internal factor may come from params (overrides the provided base_factor argument)
+        factor_used = float(params.get("base_factor", base_factor))
+        idle_factor = factor_used * (profit_target * risk_reward_ratio) / 3.0
+        observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_scale)
+        if observed_ratio > 0:
+            implied_D = 120 / (observed_ratio ** (1 / idle_penalty_power))
+            self.assertAlmostEqualFloat(
+                implied_D,
+                200.0,
+                tolerance=12.0,  # modest tolerance for float ops / rounding
+                msg=f"Fallback denominator mismatch (implied={implied_D}, expected≈200, factor_used={factor_used})",
+            )
 
     def test_exit_factor_threshold_warning_non_capping(self):
         """Ensure exit_factor_threshold does not cap the exit factor (warning-only semantics.
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index e988339..3f3e055 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1392,9 +1392,7 @@ class MyRLEnv(Base5ActionRLEnv):
             duration_ratio = 0.0
 
         model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
-        exit_factor_mode = str(
-            model_reward_parameters.get("exit_factor_mode", "piecewise")
-        ).lower()
+        exit_factor_mode = model_reward_parameters.get("exit_factor_mode", "piecewise")
 
         def _legacy(f: float, dr: float, p: Mapping) -> float:
             return f * (1.5 if dr <= 1.0 else 0.5)
@@ -1599,11 +1597,11 @@ class MyRLEnv(Base5ActionRLEnv):
         if action == Actions.Neutral.value and self._position == Positions.Neutral:
             max_idle_duration = int(
                 model_reward_parameters.get(
-                    "max_idle_duration_candles", max_trade_duration
+                    "max_idle_duration_candles", 2 * max_trade_duration
                 )
            )
             if max_idle_duration <= 0:
-                max_idle_duration = max_trade_duration
+                max_idle_duration = 2 * max_trade_duration
             idle_penalty_scale = float(
                 model_reward_parameters.get("idle_penalty_scale", 0.75)
             )
-- 
2.43.0
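
For reference, a minimal, self-contained Python sketch of the idle-penalty fallback this patch standardises on, assuming the default idle_penalty_scale=0.75 and idle_penalty_power=1.0 shown above. The helper name, its signature, and the bare params mapping are illustrative only and do not reproduce the repository's internals (in particular the idle_factor derivation from profit_target and risk_reward_ratio is not modelled here):

from typing import Mapping, Union

Number = Union[int, float]


def idle_penalty_sketch(
    idle_duration: int, idle_factor: float, params: Mapping[str, Number]
) -> float:
    """Negative idle penalty under the 2x grace-window fallback (illustrative only)."""
    idle_penalty_scale = float(params.get("idle_penalty_scale", 0.75))
    idle_penalty_power = float(params.get("idle_penalty_power", 1.0))
    max_trade_duration = int(params.get("max_trade_duration_candles", 128))
    max_idle_duration = int(params.get("max_idle_duration_candles", 0))
    if max_idle_duration <= 0:
        # Fallback per this patch: grace window = 2 * max_trade_duration_candles.
        max_idle_duration = 2 * max_trade_duration
    idle_duration_ratio = idle_duration / max(1, max_idle_duration)
    return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power


# With the defaults (max_trade_duration_candles=128, max_idle_duration_candles=0) the
# denominator becomes 256, so idling for 128 candles yields half the full-scale penalty:
print(idle_penalty_sketch(idle_duration=128, idle_factor=1.0, params={}))  # -0.375

Doubling the trade duration as the idle denominator lengthens the grace window before the idle penalty reaches full scale, and matches the denominator (2 * 100 = 200) that the new mid-range test asserts through implied_D.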