From: Jérôme Benoit
Date: Wed, 24 Dec 2025 20:02:50 +0000 (+0100)
Subject: chore(ReforceXY)!: rename idle/hold penalty scale to ratio
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=d3b85a1c8e03138052dea9e15b20e62944bb190a;p=freqai-strategies.git

chore(ReforceXY)!: rename idle/hold penalty scale to ratio

Signed-off-by: Jérôme Benoit
---
diff --git a/.gitignore b/.gitignore
index 8860197..2e089cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -379,7 +379,7 @@ config.json
 **/user_data/data/**
 !.gitkeep
-*/.serena/
+*/.serena
 */.serena/**
 */.clinerules
 */.clinerules/**
diff --git a/.opencode/opencode.jsonc b/.opencode/opencode.jsonc
new file mode 100644
index 0000000..2638a16
--- /dev/null
+++ b/.opencode/opencode.jsonc
@@ -0,0 +1,6 @@
+{
+  "$schema": "https://opencode.ai/config.json",
+  "permission": {
+    "external_directory": "allow"
+  }
+}
diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md
index cb8be7c..c4a3dd2 100644
--- a/ReforceXY/reward_space_analysis/README.md
+++ b/ReforceXY/reward_space_analysis/README.md
@@ -115,7 +115,7 @@ Single-run example:
 ```shell
 uv run python reward_space_analysis.py \
   --num_samples 30000 \
-  --params win_reward_factor=4.0 idle_penalty_scale=1.5 \
+  --params win_reward_factor=4.0 idle_penalty_ratio=1.5 \
   --out_dir sensitivity_test
 ```
@@ -298,9 +298,9 @@ where `kernel_function` depends on `exit_attenuation_mode`. See [Exit Attenuatio
 | ---------------------------- | ------- | -------------------------- |
 | `max_trade_duration_candles` | 128     | Trade duration cap         |
 | `max_idle_duration_candles`  | None    | Fallback 4× trade duration |
-| `idle_penalty_scale`         | 1.0     | Idle penalty scale         |
+| `idle_penalty_ratio`         | 1.0     | Idle penalty ratio         |
 | `idle_penalty_power`         | 1.025   | Idle penalty exponent      |
-| `hold_penalty_scale`         | 1.0     | Hold penalty scale         |
+| `hold_penalty_ratio`         | 1.0     | Hold penalty ratio         |
 | `hold_penalty_power`         | 1.025   | Hold penalty exponent      |

 #### Validation
@@ -447,8 +447,8 @@ Direct flags and `--params` produce identical outcomes; conflicts resolved by
 bulk `--params` values.

 ```shell
-uv run python reward_space_analysis.py --win_reward_factor 3.0 --idle_penalty_scale 2.0 --num_samples 15000
-uv run python reward_space_analysis.py --params win_reward_factor=3.0 idle_penalty_scale=2.0 --num_samples 15000
+uv run python reward_space_analysis.py --win_reward_factor 3.0 --idle_penalty_ratio 2.0 --num_samples 15000
+uv run python reward_space_analysis.py --params win_reward_factor=3.0 idle_penalty_ratio=2.0 --num_samples 15000
 ```

 `--params` wins on conflicts.
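Usage note (not part of the patch): the renamed keys keep the defaults listed in the README table above. A minimal sketch of how they could be supplied together; placing them under an `rl_config`-style `model_reward_parameters` mapping is an assumption based on the environment code further below, and the exact config location may differ per setup.

```python
# Illustrative only: the renamed idle/hold penalty keys with the defaults
# documented in the README table above. The surrounding mapping name is an
# assumption derived from the ReforceXY environment code later in this diff.
model_reward_parameters = {
    "max_trade_duration_candles": 128,
    "max_idle_duration_candles": None,  # falls back to 4x the trade duration cap
    "idle_penalty_ratio": 1.0,   # was: idle_penalty_scale
    "idle_penalty_power": 1.025,
    "hold_penalty_ratio": 1.0,   # was: hold_penalty_scale
    "hold_penalty_power": 1.025,
}
```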
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index 9a78b1f..eee595d 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -117,13 +117,13 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
     "invalid_action": -2.0,
     "base_factor": 100.0,
     # Idle penalty defaults
-    "idle_penalty_scale": 1.0,
+    "idle_penalty_ratio": 1.0,
     "idle_penalty_power": 1.025,
     "max_trade_duration_candles": 128,
     # Fallback: DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
     "max_idle_duration_candles": None,
     # Hold penalty defaults
-    "hold_penalty_scale": 1.0,
+    "hold_penalty_ratio": 1.0,
     "hold_penalty_power": 1.025,
     # Exit attenuation defaults
     "exit_attenuation_mode": "linear",
@@ -174,10 +174,10 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
     "invalid_action": "Penalty for invalid actions",
     "base_factor": "Base reward scale",
     "idle_penalty_power": "Idle penalty exponent",
-    "idle_penalty_scale": "Idle penalty scale",
+    "idle_penalty_ratio": "Idle penalty ratio",
     "max_trade_duration_candles": "Trade duration cap (candles)",
     "max_idle_duration_candles": "Idle duration cap (candles)",
-    "hold_penalty_scale": "Hold penalty scale",
+    "hold_penalty_ratio": "Hold penalty ratio",
     "hold_penalty_power": "Hold penalty exponent",
     "exit_attenuation_mode": "Exit kernel (legacy|sqrt|linear|power|half_life)",
     "exit_plateau": "Use plateau before attenuation",
@@ -224,10 +224,10 @@ _PARAMETER_BOUNDS: Dict[str, Dict[str, float]] = {
     "invalid_action": {"max": 0.0},  # penalty should be <= 0
     "base_factor": {"min": 0.0},
     "idle_penalty_power": {"min": 0.0},
-    "idle_penalty_scale": {"min": 0.0},
+    "idle_penalty_ratio": {"min": 0.0},
     "max_trade_duration_candles": {"min": 1.0},
     "max_idle_duration_candles": {"min": 0.0},
-    "hold_penalty_scale": {"min": 0.0},
+    "hold_penalty_ratio": {"min": 0.0},
     "hold_penalty_power": {"min": 0.0},
     "exit_linear_slope": {"min": 0.0},
     "exit_plateau_grace": {"min": 0.0},
@@ -630,7 +630,7 @@ def add_tunable_cli_args(parser: argparse.ArgumentParser) -> None:
     """Dynamically add CLI options for each tunable in DEFAULT_MODEL_REWARD_PARAMETERS.

     Rules:
-    - Use the same underscored names as option flags (e.g., --idle_penalty_scale).
+    - Use the same underscored names as option flags (e.g., --idle_penalty_ratio).
     - Defaults are None so only user-provided values override params.
     - For exit_attenuation_mode, enforce allowed choices (case-sensitive).
     - Skip keys already managed as top-level options (e.g., base_factor) to avoid duplicates.
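Migration note (not part of the patch): the `!` in the subject marks this as a breaking rename, so configs or `--params` overrides still using the old `*_penalty_scale` keys will no longer be recognized. A hypothetical sketch of a one-off key migration, assuming parameters are held in a plain dict; the helper name and behaviour are illustrative only.

```python
# Hypothetical helper (not part of this patch): move values from the removed
# *_penalty_scale keys onto the new *_penalty_ratio keys, in place.
def migrate_penalty_keys(params: dict) -> dict:
    renames = {
        "idle_penalty_scale": "idle_penalty_ratio",
        "hold_penalty_scale": "hold_penalty_ratio",
    }
    for old_key, new_key in renames.items():
        # Only migrate when the new key is not already set explicitly.
        if old_key in params and new_key not in params:
            params[new_key] = params.pop(old_key)
    return params


# Example: an old-style override keeps its value under the new name.
params = migrate_penalty_keys({"idle_penalty_scale": 1.5})
assert params == {"idle_penalty_ratio": 1.5}
```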
@@ -1095,10 +1095,10 @@ def _get_next_position(
 
 def _idle_penalty(context: RewardContext, idle_factor: float, params: RewardParams) -> float:
     """Compute idle penalty."""
-    idle_penalty_scale = _get_float_param(
+    idle_penalty_ratio = _get_float_param(
         params,
-        "idle_penalty_scale",
-        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 1.0),
+        "idle_penalty_ratio",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_ratio", 1.0),
     )
     idle_penalty_power = _get_float_param(
         params,
@@ -1107,15 +1107,15 @@ def _idle_penalty(context: RewardContext, idle_factor: float, params: RewardPara
     )
     max_idle_duration_candles = get_max_idle_duration_candles(params)
     idle_duration_ratio = context.idle_duration / max(1, max_idle_duration_candles)
-    return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
+    return -idle_factor * idle_penalty_ratio * idle_duration_ratio**idle_penalty_power
 
 
 def _hold_penalty(context: RewardContext, hold_factor: float, params: RewardParams) -> float:
     """Compute hold penalty."""
-    hold_penalty_scale = _get_float_param(
+    hold_penalty_ratio = _get_float_param(
         params,
-        "hold_penalty_scale",
-        DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_scale", 1.0),
+        "hold_penalty_ratio",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_ratio", 1.0),
     )
     hold_penalty_power = _get_float_param(
         params,
@@ -1132,7 +1132,7 @@ def _hold_penalty(context: RewardContext, hold_factor: float, params: RewardPara
 
     if duration_ratio < 1.0:
         return _fail_safely("hold_penalty_duration_ratio_lt_1")
-    return -hold_factor * hold_penalty_scale * (duration_ratio - 1.0) ** hold_penalty_power
+    return -hold_factor * hold_penalty_ratio * (duration_ratio - 1.0) ** hold_penalty_power
 
 
 def _compute_exit_reward(
@@ -3586,7 +3586,7 @@ def build_argument_parser() -> argparse.ArgumentParser:
         nargs="*",
         default=[],
         metavar="KEY=VALUE",
-        help="Override reward parameters, e.g. hold_penalty_scale=0.5",
+        help="Override reward parameters, e.g. hold_penalty_ratio=0.5",
     )
     # Dynamically add CLI options for all tunables
     add_tunable_cli_args(parser)
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
index 0a3673e..9f0c214 100644
--- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
@@ -727,10 +727,10 @@ class TestRewardComponents(RewardSpaceTestBase):
         if ratio is not None:
             self.assertAlmostEqualFloat(abs(ratio), 2.0, tolerance=0.2)
 
-        idle_penalty_scale = _get_float_param(params, "idle_penalty_scale", 1.0)
+        idle_penalty_ratio = _get_float_param(params, "idle_penalty_ratio", 1.0)
         idle_penalty_power = _get_float_param(params, "idle_penalty_power", 1.025)
         idle_factor = base_factor * (profit_aim / risk_reward_ratio)
-        observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_scale)
+        observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_ratio)
         if observed_ratio > 0:
             implied_max_idle_duration_candles = 120 / observed_ratio ** (1 / idle_penalty_power)
             tolerance = 0.05 * expected_max_idle_duration_candles
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 0b85360..e88efb6 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -177,9 +177,9 @@ class ReforceXY(BaseReinforcementLearningModel):
 
     DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
     DEFAULT_INVALID_ACTION: Final[float] = -2.0
-    DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 1.0
+    DEFAULT_IDLE_PENALTY_RATIO: Final[float] = 1.0
     DEFAULT_IDLE_PENALTY_POWER: Final[float] = 1.025
-    DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 1.0
+    DEFAULT_HOLD_PENALTY_RATIO: Final[float] = 1.0
     DEFAULT_HOLD_PENALTY_POWER: Final[float] = 1.025
 
     DEFAULT_CHECK_INVARIANTS: Final[bool] = True
@@ -2861,9 +2861,9 @@ class MyRLEnv(Base5ActionRLEnv):
             and self._position == Positions.Neutral
         ):
             max_idle_duration = max(1, self.max_idle_duration_candles)
-            idle_penalty_scale = float(
+            idle_penalty_ratio = float(
                 model_reward_parameters.get(
-                    "idle_penalty_scale", ReforceXY.DEFAULT_IDLE_PENALTY_SCALE
+                    "idle_penalty_ratio", ReforceXY.DEFAULT_IDLE_PENALTY_RATIO
                 )
             )
             idle_penalty_power = float(
@@ -2875,7 +2875,7 @@ class MyRLEnv(Base5ActionRLEnv):
             idle_duration_ratio = idle_duration / max(1, max_idle_duration)
             base_reward = (
                 -idle_factor
-                * idle_penalty_scale
+                * idle_penalty_ratio
                 * idle_duration_ratio**idle_penalty_power
             )
             self._last_idle_penalty = float(base_reward)
@@ -2886,9 +2886,9 @@ class MyRLEnv(Base5ActionRLEnv):
             and self._position in (Positions.Short, Positions.Long)
             and action == Actions.Neutral.value
         ):
-            hold_penalty_scale = float(
+            hold_penalty_ratio = float(
                 model_reward_parameters.get(
-                    "hold_penalty_scale", ReforceXY.DEFAULT_HOLD_PENALTY_SCALE
+                    "hold_penalty_ratio", ReforceXY.DEFAULT_HOLD_PENALTY_RATIO
                 )
             )
             hold_penalty_power = float(
@@ -2901,7 +2901,7 @@ class MyRLEnv(Base5ActionRLEnv):
             else:
                 base_reward = (
                     -hold_factor
-                    * hold_penalty_scale
+                    * hold_penalty_ratio
                     * (duration_ratio - 1.0) ** hold_penalty_power
                 )
                 self._last_hold_penalty = float(base_reward)
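For reference, the two penalty terms touched by the rename, written out as standalone functions. This is a simplified sketch of what the diffed code computes, with idle_factor/hold_factor and the duration quantities taken as precomputed inputs; it is not a drop-in replacement for either implementation.

```python
# Simplified sketch of the renamed penalty terms, mirroring the diff above.
# idle_factor / hold_factor and the duration inputs are assumed to be
# computed elsewhere (see _idle_penalty / _hold_penalty and MyRLEnv).

def idle_penalty(idle_factor: float, idle_duration: int,
                 max_idle_duration_candles: int,
                 idle_penalty_ratio: float = 1.0,
                 idle_penalty_power: float = 1.025) -> float:
    # Penalty grows with the fraction of the idle duration cap already spent.
    idle_duration_ratio = idle_duration / max(1, max_idle_duration_candles)
    return -idle_factor * idle_penalty_ratio * idle_duration_ratio**idle_penalty_power


def hold_penalty(hold_factor: float, duration_ratio: float,
                 hold_penalty_ratio: float = 1.0,
                 hold_penalty_power: float = 1.025) -> float:
    # The real code returns a fail-safe value when duration_ratio < 1.0;
    # here the penalty is simply zero until the trade duration cap is exceeded.
    if duration_ratio < 1.0:
        return 0.0
    return -hold_factor * hold_penalty_ratio * (duration_ratio - 1.0) ** hold_penalty_power


# Example: twice the idle cap with ratio 1.0 yields roughly -2x the idle factor.
print(idle_penalty(idle_factor=50.0, idle_duration=256, max_idle_duration_candles=128))
```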