From: Jérôme Benoit
Date: Thu, 9 Oct 2025 20:18:51 +0000 (+0200)
Subject: perf(reforcexy): refine default reward tunables
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=3919b0f00656f1788a0a15e7183eafa970561361;p=freqai-strategies.git

perf(reforcexy): refine default reward tunables

Signed-off-by: Jérôme Benoit
---

diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md
index 6c1b753..dd44b58 100644
--- a/ReforceXY/reward_space_analysis/README.md
+++ b/ReforceXY/reward_space_analysis/README.md
@@ -259,13 +259,13 @@ _Invalid action penalty:_
 
 _Idle penalty configuration:_
 
-- `idle_penalty_scale` (default: 0.75) - Scale of idle penalty
-- `idle_penalty_power` (default: 1.0) - Power applied to idle penalty scaling
+- `idle_penalty_scale` (default: 0.5) - Scale of idle penalty
+- `idle_penalty_power` (default: 1.025) - Power applied to idle penalty scaling
 
 _Holding penalty configuration:_
 
-- `holding_penalty_scale` (default: 0.5) - Scale of holding penalty
-- `holding_penalty_power` (default: 1.0) - Power applied to holding penalty scaling
+- `holding_penalty_scale` (default: 0.25) - Scale of holding penalty
+- `holding_penalty_power` (default: 1.025) - Power applied to holding penalty scaling
 
 _Exit attenuation configuration:_
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index 3cf8384..c9c88e0 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -125,13 +125,13 @@ DEFAULT_MODEL_REWARD_PARAMETERS: Dict[str, float | str] = {
     "invalid_action": -2.0,
     "base_factor": 100.0,
     # Idle penalty (env defaults)
-    "idle_penalty_power": 1.0,
-    "idle_penalty_scale": 0.75,
+    "idle_penalty_scale": 0.5,
+    "idle_penalty_power": 1.025,
     # Fallback semantics: if <=0 or unset → 2 * max_trade_duration_candles (grace window before full idle penalty)
     "max_idle_duration_candles": 0,
     # Holding keys (env defaults)
-    "holding_penalty_scale": 0.5,
-    "holding_penalty_power": 1.0,
+    "holding_penalty_scale": 0.25,
+    "holding_penalty_power": 1.025,
     # Exit attenuation configuration (env default)
     "exit_attenuation_mode": "linear",
     "exit_plateau": True,
@@ -512,8 +512,16 @@ def _idle_penalty(
     context: RewardContext, idle_factor: float, params: Dict[str, float | str]
 ) -> float:
     """Mirror the environment's idle penalty behaviour."""
-    idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 0.75)
-    idle_penalty_power = _get_param_float(params, "idle_penalty_power", 1.0)
+    idle_penalty_scale = _get_param_float(
+        params,
+        "idle_penalty_scale",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 0.5),
+    )
+    idle_penalty_power = _get_param_float(
+        params,
+        "idle_penalty_power",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_power", 1.025),
+    )
     max_trade_duration_candles = params.get("max_trade_duration_candles")
     try:
         if max_trade_duration_candles is not None:
@@ -542,8 +550,16 @@ def _holding_penalty(
     context: RewardContext, holding_factor: float, params: Dict[str, float | str]
 ) -> float:
     """Mirror the environment's holding penalty behaviour."""
-    holding_penalty_scale = _get_param_float(params, "holding_penalty_scale", 0.5)
-    holding_penalty_power = _get_param_float(params, "holding_penalty_power", 1.0)
+    holding_penalty_scale = _get_param_float(
+        params,
+        "holding_penalty_scale",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("holding_penalty_scale", 0.25),
+    )
+    holding_penalty_power = _get_param_float(
+        params,
+        "holding_penalty_power",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("holding_penalty_power", 1.025),
+    )
     duration_ratio = _compute_duration_ratio(
         context.trade_duration, context.max_trade_duration
     )
diff --git a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
index 9bfe856..3d70992 100644
--- a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
@@ -636,8 +636,8 @@ class TestRewardAlignment(RewardSpaceTestBase):
             action_masking=True,
         )
         self.assertLess(br_mid.idle_penalty, 0.0)
-        idle_penalty_scale = float(params.get("idle_penalty_scale", 0.75))
-        idle_penalty_power = float(params.get("idle_penalty_power", 1.0))
+        idle_penalty_scale = float(params.get("idle_penalty_scale", 0.5))
+        idle_penalty_power = float(params.get("idle_penalty_power", 1.025))
         # Internal factor may come from params (overrides provided base_factor argument)
         factor_used = float(params.get("base_factor", base_factor))
         idle_factor = factor_used * (profit_target * risk_reward_ratio) / 3.0
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 01fa6d8..b0269ac 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1616,10 +1616,10 @@ class MyRLEnv(Base5ActionRLEnv):
             if max_idle_duration <= 0:
                 max_idle_duration = 2 * max_trade_duration
             idle_penalty_scale = float(
-                model_reward_parameters.get("idle_penalty_scale", 0.75)
+                model_reward_parameters.get("idle_penalty_scale", 0.5)
             )
             idle_penalty_power = float(
-                model_reward_parameters.get("idle_penalty_power", 1.0)
+                model_reward_parameters.get("idle_penalty_power", 1.025)
             )
             idle_duration = self.get_idle_duration()
             idle_duration_ratio = idle_duration / max(1, max_idle_duration)
@@ -1635,10 +1635,10 @@ class MyRLEnv(Base5ActionRLEnv):
             and action == Actions.Neutral.value
         ):
             holding_penalty_scale = float(
-                model_reward_parameters.get("holding_penalty_scale", 0.5)
+                model_reward_parameters.get("holding_penalty_scale", 0.25)
             )
             holding_penalty_power = float(
-                model_reward_parameters.get("holding_penalty_power", 1.0)
+                model_reward_parameters.get("holding_penalty_power", 1.025)
             )
             if duration_ratio < 1.0:
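
For orientation, below is a minimal sketch of the penalty shape these tunables control. The final penalty expressions are not part of this diff, so the multiplicative scale * ratio ** power form is an assumption inferred from the surrounding hunks; the parameter names and new defaults are taken verbatim from the change, and the grace-window fallback mirrors the `if max_idle_duration <= 0` branch in MyRLEnv. The `*_sketch` function names are illustrative, not repository code.

# Illustrative sketch only, not the repository's verbatim implementation.

def idle_penalty_sketch(
    idle_duration: int,
    max_idle_duration: int,
    idle_factor: float,
    idle_penalty_scale: float = 0.5,  # new default (was 0.75)
    idle_penalty_power: float = 1.025,  # new default (was 1.0)
) -> float:
    """Negative reward growing with the share of the idle grace window used."""
    # MyRLEnv replaces a non-positive max_idle_duration with
    # 2 * max_trade_duration before reaching this point.
    idle_duration_ratio = idle_duration / max(1, max_idle_duration)
    return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power


def holding_penalty_sketch(
    trade_duration: int,
    max_trade_duration: int,
    holding_factor: float,
    holding_penalty_scale: float = 0.25,  # new default (was 0.5)
    holding_penalty_power: float = 1.025,  # new default (was 1.0)
) -> float:
    """Same assumed shape, applied to the open-trade duration ratio."""
    duration_ratio = trade_duration / max(1, max_trade_duration)
    return -holding_factor * holding_penalty_scale * duration_ratio**holding_penalty_power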
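
Assuming that shape, the refinement is two-fold: halving the scales (0.75 → 0.5 for idle, 0.5 → 0.25 for holding) lowers the penalty magnitude across the board, while the power nudged from 1.0 to 1.025 leaves the endpoints untouched (a ratio of 0 still yields 0, a ratio of 1 still yields the full scale) and bends the curve slightly below linear in between, e.g. 0.5 ** 1.025 ≈ 0.491 rather than 0.5. Under this reading, a half-window idle spell that previously cost 0.375 * idle_factor now costs about 0.246 * idle_factor, with idle_factor derived as base_factor * (profit_target * risk_reward_ratio) / 3.0 per the test hunk above.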