From 2aecf3c978928a0522d89fbfc40bb26f906a4a41 Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?=
Date: Mon, 6 Oct 2025 19:01:36 +0200
Subject: [PATCH] perf(reforcexy): fine-tune default reward settings
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jérôme Benoit
---
 ReforceXY/reward_space_analysis/README.md     | 12 +++---
 .../reward_space_analysis.py                  | 40 +++++++++++--------
 ReforceXY/user_data/freqaimodels/ReforceXY.py |  8 ++--
 3 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md
index 6f4fe20..32bbd13 100644
--- a/ReforceXY/reward_space_analysis/README.md
+++ b/ReforceXY/reward_space_analysis/README.md
@@ -218,7 +218,7 @@ _Invalid action penalty:_
 
 _Idle penalty configuration:_
 
-- `idle_penalty_scale` (default: 1.0) - Scale of idle penalty
+- `idle_penalty_scale` (default: 0.75) - Scale of idle penalty
 - `idle_penalty_power` (default: 1.0) - Power applied to idle penalty scaling
 
 _Holding penalty configuration:_
@@ -238,8 +238,8 @@ _Exit factor configuration:_
 
 _Efficiency configuration:_
 
-- `efficiency_weight` (default: 0.75) - Weight for efficiency factor in exit reward
-- `efficiency_center` (default: 0.75) - Center for efficiency factor sigmoid
+- `efficiency_weight` (default: 1.0) - Weight for efficiency factor in exit reward
+- `efficiency_center` (default: 0.35) - Linear pivot in [0,1] for efficiency ratio. If efficiency_ratio > center ⇒ amplification (>1); if < center ⇒ attenuation (<1, floored at 0).
 
 _Profit factor configuration:_
 
@@ -368,7 +368,7 @@ Key fields:
 | `seed` | Random seed used (deterministic cascade) |
 | `profit_target_effective` | Profit target after risk/reward scaling |
 | `top_features` | Top 5 features by permutation importance |
-| `reward_param_overrides` | Subset of reward tunables whose values differ from defaults |
+| `reward_param_overrides` | Subset of reward tunables explicitly supplied via CLI |
 | `params_hash` | SHA-256 hash combining simulation params + overrides (reproducibility) |
 | `params` | Echo of core simulation parameters (subset, for quick audit) |
 | `parameter_adjustments` | Any automatic bound clamps applied by `validate_reward_parameters` |
@@ -678,8 +678,8 @@ Before simulation (early in `main()`), `validate_reward_parameters` enforces num
 | `exit_power_tau` | 1e-6 | 1.0 | Mapped to alpha = -ln(tau) |
 | `exit_half_life` | 1e-6 | — | Half-life in duration ratio units |
 | `efficiency_weight` | 0.0 | 2.0 | Blend weight |
-| `efficiency_center` | 0.0 | 1.0 | Sigmoid center |
-| `win_reward_factor` | 0.0 | — | Amplification ≥ 0 |
+| `efficiency_center` | 0.0 | 1.0 | Linear pivot (efficiency ratio center) |
+| `win_reward_factor` | 0.0 | — | Amplification for pnl above target |
 | `pnl_factor_beta` | 1e-6 | — | Sensitivity ≥ tiny positive |
 
 Non-finite inputs are reset to the applicable minimum (or 0.0 if only a maximum is declared) and logged as adjustments.
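Editor's note: the README entries above describe `efficiency_center` as a linear pivot, yet the closed-form efficiency factor is not visible in this patch (the `_get_pnl_factor` and `MyRLEnv` hunks below cut off right after the `max_pnl`/`min_pnl` bookkeeping). The following is a minimal sketch, not the repository code: the definition of `efficiency_ratio` as the position of `pnl` inside the [min, max] unrealized-PnL range is an assumption, while the pivot behaviour (amplification above the center, attenuation below, floored at 0) follows the documented semantics.

# Illustrative sketch only -- not from this patch; efficiency_ratio is assumed.
def efficiency_factor_sketch(
    pnl: float,
    max_unrealized: float,
    min_unrealized: float,
    efficiency_weight: float = 1.0,   # new default in this patch
    efficiency_center: float = 0.35,  # new default in this patch
) -> float:
    max_pnl = max(max_unrealized, pnl)
    min_pnl = min(min_unrealized, pnl)
    pnl_range = max_pnl - min_pnl
    if efficiency_weight == 0.0 or pnl < 0.0 or pnl_range <= 0.0:
        return 1.0  # neutral when disabled, losing, or degenerate range
    efficiency_ratio = (pnl - min_pnl) / pnl_range  # assumed: position in [0,1]
    # Linear pivot: >1 above efficiency_center, <1 below, floored at 0.
    return max(0.0, 1.0 + efficiency_weight * (efficiency_ratio - efficiency_center))

With the new defaults, an exit near the top of the observed range (ratio 0.85) yields a factor of 1.5, while one near the bottom (ratio 0.10) yields 0.75.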
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index 29afe59..01a9834 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -129,7 +129,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: Dict[str, float | str] = {
     "base_factor": 100.0,
     # Idle penalty (env defaults)
     "idle_penalty_power": 1.0,
-    "idle_penalty_scale": 1.0,
+    "idle_penalty_scale": 0.75,
     # If <=0 or unset, falls back to max_trade_duration_candles at runtime
     "max_idle_duration_candles": 0,
     # Holding keys (env defaults)
@@ -143,8 +143,8 @@ DEFAULT_MODEL_REWARD_PARAMETERS: Dict[str, float | str] = {
     "exit_power_tau": 0.5,
     "exit_half_life": 0.5,
     # Efficiency keys (env defaults)
-    "efficiency_weight": 0.75,
-    "efficiency_center": 0.75,
+    "efficiency_weight": 1.0,
+    "efficiency_center": 0.35,
     # Profit factor params (env defaults)
     "win_reward_factor": 2.0,
     "pnl_factor_beta": 0.5,
@@ -169,7 +169,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
     "exit_power_tau": "Tau in (0,1] to derive alpha for power mode.",
     "exit_half_life": "Half-life for exponential decay exit mode.",
     "efficiency_weight": "Weight for efficiency factor in exit reward.",
-    "efficiency_center": "Center for efficiency factor sigmoid.",
+    "efficiency_center": "Pivot (in [0,1]) for linear efficiency factor; efficiency_ratio above this increases factor, below decreases.",
     "win_reward_factor": "Amplification for pnl above target (no hard cap; asymptotic).",
     "pnl_factor_beta": "Sensitivity of amplification around target.",
     "check_invariants": "Boolean flag (true/false) to enable runtime invariant & safety checks.",
@@ -443,8 +443,8 @@ def _get_pnl_factor(
     )
 
     efficiency_factor = 1.0
-    efficiency_weight = float(params.get("efficiency_weight", 0.75))
-    efficiency_center = float(params.get("efficiency_center", 0.75))
+    efficiency_weight = float(params.get("efficiency_weight", 1.0))
+    efficiency_center = float(params.get("efficiency_center", 0.35))
     if efficiency_weight != 0.0 and pnl >= 0.0:
         max_pnl = max(context.max_unrealized_profit, pnl)
         min_pnl = min(context.min_unrealized_profit, pnl)
@@ -487,7 +487,7 @@ def _idle_penalty(
     context: RewardContext, idle_factor: float, params: Dict[str, float | str]
 ) -> float:
     """Mirror the environment's idle penalty behaviour."""
-    idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 1.0)
+    idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 0.75)
     idle_penalty_power = _get_param_float(params, "idle_penalty_power", 1.0)
     max_trade_duration = int(params.get("max_trade_duration_candles", 128))
     max_idle_duration_candles = params.get("max_idle_duration_candles")
@@ -834,8 +834,8 @@ def _validate_simulation_invariants(df: pd.DataFrame) -> None:
     """
     # INVARIANT 1: PnL Conservation - Total PnL must equal sum of exit PnL
    total_pnl = df["pnl"].sum()
-    exit_mask = df["reward_exit"] != 0
-    exit_pnl_sum = df.loc[exit_mask, "pnl"].sum()
+    exit_action_mask = df["action"].isin([2.0, 4.0])
+    exit_pnl_sum = df.loc[exit_action_mask, "pnl"].sum()
     pnl_diff = abs(total_pnl - exit_pnl_sum)
 
     if pnl_diff > 1e-10:
@@ -846,8 +846,7 @@ def _validate_simulation_invariants(df: pd.DataFrame) -> None:
     # INVARIANT 2: PnL Exclusivity - Only exit actions should have non-zero PnL
     non_zero_pnl_actions = set(df[df["pnl"] != 0]["action"].unique())
-    valid_exit_actions = {2.0, 4.0}  # Long_exit, Short_exit
-
+    valid_exit_actions = {2.0, 4.0}
     invalid_actions = non_zero_pnl_actions - valid_exit_actions
 
     if invalid_actions:
         raise AssertionError(
@@ -2587,12 +2586,19 @@ def main() -> None:
         top_features = fi_df.head(5)["feature"].tolist()
     else:
         top_features = []
-    # Detect reward parameter overrides vs defaults for traceability
-    reward_param_overrides = {
-        k: params[k]
-        for k in DEFAULT_MODEL_REWARD_PARAMETERS
-        if k in params and params[k] != DEFAULT_MODEL_REWARD_PARAMETERS[k]
-    }
+    # Detect reward parameter overrides for traceability.
+    reward_param_overrides = {}
+    # Step 1: differences
+    for k in DEFAULT_MODEL_REWARD_PARAMETERS:
+        if k in params and params[k] != DEFAULT_MODEL_REWARD_PARAMETERS[k]:
+            reward_param_overrides[k] = params[k]
+    # Step 2: explicit flags
+    for k in DEFAULT_MODEL_REWARD_PARAMETERS:
+        if hasattr(args, k):
+            v = getattr(args, k)
+            if v is not None:
+                # Use the resolved param value for consistency
+                reward_param_overrides[k] = params.get(k, v)
 
     manifest = {
         "generated_at": pd.Timestamp.now().isoformat(),
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index befc5fc..e988339 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1516,11 +1516,9 @@ class MyRLEnv(Base5ActionRLEnv):
         )
 
         efficiency_factor = 1.0
-        efficiency_weight = float(
-            model_reward_parameters.get("efficiency_weight", 0.75)
-        )
+        efficiency_weight = float(model_reward_parameters.get("efficiency_weight", 1.0))
         efficiency_center = float(
-            model_reward_parameters.get("efficiency_center", 0.75)
+            model_reward_parameters.get("efficiency_center", 0.35)
         )
         if efficiency_weight != 0.0 and pnl >= 0.0:
             max_pnl = max(self.get_max_unrealized_profit(), pnl)
@@ -1607,7 +1605,7 @@ class MyRLEnv(Base5ActionRLEnv):
         if max_idle_duration <= 0:
             max_idle_duration = max_trade_duration
         idle_penalty_scale = float(
-            model_reward_parameters.get("idle_penalty_scale", 1.0)
+            model_reward_parameters.get("idle_penalty_scale", 0.75)
         )
         idle_penalty_power = float(
             model_reward_parameters.get("idle_penalty_power", 1.0)
-- 
2.43.0
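Editor's note on the override-detection change in `main()`: the replaced dict comprehension only captured values that differ from the defaults, so a flag explicitly restating a default never reached the manifest. The two-step loop keeps the difference-based detection and additionally records flags explicitly supplied on the CLI. A self-contained sketch of the behaviour (the `detect_overrides` helper and the two-parameter `DEFAULTS` dict are illustrative stand-ins, not module API):

from argparse import Namespace

DEFAULTS = {"idle_penalty_scale": 0.75, "efficiency_center": 0.35}

def detect_overrides(params: dict, args: Namespace, defaults: dict) -> dict:
    overrides = {}
    # Step 1: resolved values that differ from the defaults.
    for k in defaults:
        if k in params and params[k] != defaults[k]:
            overrides[k] = params[k]
    # Step 2: flags explicitly supplied on the CLI, recorded even when they
    # merely restate a default; the resolved param value wins for consistency.
    for k in defaults:
        v = getattr(args, k, None)
        if v is not None:
            overrides[k] = params.get(k, v)
    return overrides

params = {"idle_penalty_scale": 0.75, "efficiency_center": 0.5}
args = Namespace(idle_penalty_scale=0.75, efficiency_center=None)
print(detect_overrides(params, args, DEFAULTS))
# {'efficiency_center': 0.5, 'idle_penalty_scale': 0.75}

Here `idle_penalty_scale` equals its default but was passed explicitly, so it now appears in the manifest; the old difference-only logic would have silently dropped it.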
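Editor's note on the `idle_penalty_scale` default change (1.0 to 0.75): the penalty expression itself is elided in both the `_idle_penalty` and `MyRLEnv` hunks above, so the formula below is an assumption built only from the knobs that are visible (scale, power, idle factor, and the documented fallback from `max_idle_duration_candles` to `max_trade_duration_candles`):

# Illustrative sketch only; the actual penalty expression is not shown in the patch.
def idle_penalty_sketch(
    idle_duration: int,
    idle_factor: float,
    max_trade_duration_candles: int = 128,
    max_idle_duration_candles: int = 0,
    idle_penalty_scale: float = 0.75,  # new default in this patch
    idle_penalty_power: float = 1.0,
) -> float:
    # Documented fallback: <=0 or unset means "use max_trade_duration_candles".
    max_idle_duration = max_idle_duration_candles
    if max_idle_duration <= 0:
        max_idle_duration = max_trade_duration_candles
    duration_ratio = idle_duration / max_idle_duration
    # Assumed shape: penalty grows with the idle-duration ratio raised to the
    # configured power, then scaled; the 0.75 default softens it by 25%.
    return -idle_factor * idle_penalty_scale * duration_ratio**idle_penalty_power

Whatever the exact expression, lowering the scale from 1.0 to 0.75 uniformly weakens the idle penalty relative to the other reward components.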