Piment Noir Git Repositories - freqai-strategies.git/commitdiff
perf(reforcexy): refine default reward tunables
author     Jérôme Benoit <jerome.benoit@piment-noir.org>
           Thu, 9 Oct 2025 20:18:51 +0000 (22:18 +0200)
committer  Jérôme Benoit <jerome.benoit@piment-noir.org>
           Thu, 9 Oct 2025 20:18:51 +0000 (22:18 +0200)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/test_reward_space_analysis.py
ReforceXY/user_data/freqaimodels/ReforceXY.py

ReforceXY/reward_space_analysis/README.md
index 6c1b753aecd9eecdcc9adcfe086388c7ed1df5c4..dd44b589ab3f0bcbe5e2f027ca5ddfbfd0b218e4 100644
@@ -259,13 +259,13 @@ _Invalid action penalty:_
 
 _Idle penalty configuration:_
 
-- `idle_penalty_scale` (default: 0.75) - Scale of idle penalty
-- `idle_penalty_power` (default: 1.0) - Power applied to idle penalty scaling
+- `idle_penalty_scale` (default: 0.5) - Scale of idle penalty
+- `idle_penalty_power` (default: 1.025) - Power applied to idle penalty scaling
 
 _Holding penalty configuration:_
 
-- `holding_penalty_scale` (default: 0.5) - Scale of holding penalty
-- `holding_penalty_power` (default: 1.0) - Power applied to holding penalty scaling
+- `holding_penalty_scale` (default: 0.25) - Scale of holding penalty
+- `holding_penalty_power` (default: 1.025) - Power applied to holding penalty scaling
 
 _Exit attenuation configuration:_
 
ReforceXY/reward_space_analysis/reward_space_analysis.py
index 3cf83842d1bf57c8daa91dcc0e899d202792ce08..c9c88e0e949b93f83dca08e06ccaea67cada3504 100644
@@ -125,13 +125,13 @@ DEFAULT_MODEL_REWARD_PARAMETERS: Dict[str, float | str] = {
     "invalid_action": -2.0,
     "base_factor": 100.0,
     # Idle penalty (env defaults)
-    "idle_penalty_power": 1.0,
-    "idle_penalty_scale": 0.75,
+    "idle_penalty_scale": 0.5,
+    "idle_penalty_power": 1.025,
     # Fallback semantics: if <=0 or unset → 2 * max_trade_duration_candles (grace window before full idle penalty)
     "max_idle_duration_candles": 0,
     # Holding keys (env defaults)
-    "holding_penalty_scale": 0.5,
-    "holding_penalty_power": 1.0,
+    "holding_penalty_scale": 0.25,
+    "holding_penalty_power": 1.025,
     # Exit attenuation configuration (env default)
     "exit_attenuation_mode": "linear",
     "exit_plateau": True,
@@ -512,8 +512,16 @@ def _idle_penalty(
     context: RewardContext, idle_factor: float, params: Dict[str, float | str]
 ) -> float:
     """Mirror the environment's idle penalty behaviour."""
-    idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 0.75)
-    idle_penalty_power = _get_param_float(params, "idle_penalty_power", 1.0)
+    idle_penalty_scale = _get_param_float(
+        params,
+        "idle_penalty_scale",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 0.5),
+    )
+    idle_penalty_power = _get_param_float(
+        params,
+        "idle_penalty_power",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_power", 1.025),
+    )
     max_trade_duration_candles = params.get("max_trade_duration_candles")
     try:
         if max_trade_duration_candles is not None:
@@ -542,8 +550,16 @@ def _holding_penalty(
     context: RewardContext, holding_factor: float, params: Dict[str, float | str]
 ) -> float:
     """Mirror the environment's holding penalty behaviour."""
-    holding_penalty_scale = _get_param_float(params, "holding_penalty_scale", 0.5)
-    holding_penalty_power = _get_param_float(params, "holding_penalty_power", 1.0)
+    holding_penalty_scale = _get_param_float(
+        params,
+        "holding_penalty_scale",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("holding_penalty_scale", 0.25),
+    )
+    holding_penalty_power = _get_param_float(
+        params,
+        "holding_penalty_power",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("holding_penalty_power", 1.025),
+    )
     duration_ratio = _compute_duration_ratio(
         context.trade_duration, context.max_trade_duration
     )
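
The two hunks above replace hard-coded fallback literals with lookups into DEFAULT_MODEL_REWARD_PARAMETERS, so each default is declared once. A hypothetical helper condensing that repeated pattern (the name _param_with_default is not in the diff; _get_param_float and the defaults table are the module's own):

    def _param_with_default(params, key, fallback):
        # Resolve a tunable: explicit params win, then the central
        # defaults table, then the last-resort literal.
        return _get_param_float(
            params, key, DEFAULT_MODEL_REWARD_PARAMETERS.get(key, fallback)
        )

With such a helper, each call site would shrink back to one line, e.g. idle_penalty_scale = _param_with_default(params, "idle_penalty_scale", 0.5).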
ReforceXY/reward_space_analysis/test_reward_space_analysis.py
index 9bfe8569acd59836a7c5d8e3cf3a9e80a82fd4e0..3d70992b59b93627009e1bc22aba275e3617f276 100644
@@ -636,8 +636,8 @@ class TestRewardAlignment(RewardSpaceTestBase):
             action_masking=True,
         )
         self.assertLess(br_mid.idle_penalty, 0.0)
-        idle_penalty_scale = float(params.get("idle_penalty_scale", 0.75))
-        idle_penalty_power = float(params.get("idle_penalty_power", 1.0))
+        idle_penalty_scale = float(params.get("idle_penalty_scale", 0.5))
+        idle_penalty_power = float(params.get("idle_penalty_power", 1.025))
         # Internal factor may come from params (overrides provided base_factor argument)
         factor_used = float(params.get("base_factor", base_factor))
         idle_factor = factor_used * (profit_target * risk_reward_ratio) / 3.0
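
A natural next assertion recomputes the expected penalty from those same values; a sketch under the assumed scale * factor * ratio ** power shape (idle_duration and max_idle_duration are hypothetical locals, not shown in this hunk):

    # Hypothetical continuation of the test, mirroring the assumed env formula.
    idle_duration_ratio = idle_duration / max(1, max_idle_duration)
    expected_idle_penalty = (
        -idle_penalty_scale * idle_factor * idle_duration_ratio ** idle_penalty_power
    )
    self.assertAlmostEqual(br_mid.idle_penalty, expected_idle_penalty)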
ReforceXY/user_data/freqaimodels/ReforceXY.py
index 01fa6d81ae41a56009ab08a157fe534661d31071..b0269acf587d626db6116e99d1a0d07ad297fa8a 100644
@@ -1616,10 +1616,10 @@ class MyRLEnv(Base5ActionRLEnv):
             if max_idle_duration <= 0:
                 max_idle_duration = 2 * max_trade_duration
             idle_penalty_scale = float(
-                model_reward_parameters.get("idle_penalty_scale", 0.75)
+                model_reward_parameters.get("idle_penalty_scale", 0.5)
             )
             idle_penalty_power = float(
-                model_reward_parameters.get("idle_penalty_power", 1.0)
+                model_reward_parameters.get("idle_penalty_power", 1.025)
             )
             idle_duration = self.get_idle_duration()
             idle_duration_ratio = idle_duration / max(1, max_idle_duration)
@@ -1635,10 +1635,10 @@ class MyRLEnv(Base5ActionRLEnv):
             and action == Actions.Neutral.value
         ):
             holding_penalty_scale = float(
-                model_reward_parameters.get("holding_penalty_scale", 0.5)
+                model_reward_parameters.get("holding_penalty_scale", 0.25)
             )
             holding_penalty_power = float(
-                model_reward_parameters.get("holding_penalty_power", 1.0)
+                model_reward_parameters.get("holding_penalty_power", 1.025)
             )
 
             if duration_ratio < 1.0:
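
Because these literals are only fallbacks, the previous behaviour stays one override away. A sketch of restoring the old defaults through model_reward_parameters (the freqai / rl_config nesting follows freqtrade's RL configuration layout; treat the exact placement as an assumption):

    # Hypothetical config override: revert to the pre-commit defaults.
    config["freqai"]["rl_config"]["model_reward_parameters"].update({
        "idle_penalty_scale": 0.75,
        "idle_penalty_power": 1.0,
        "holding_penalty_scale": 0.5,
        "holding_penalty_power": 1.0,
    })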