From: Jérôme Benoit Date: Tue, 7 Oct 2025 22:09:17 +0000 (+0200) Subject: refactor(reforcexy): refine reward CLI API X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=fade2fb1448291f0826f5a981a56b93235370827;p=freqai-strategies.git refactor(reforcexy): refine reward CLI API Signed-off-by: Jérôme Benoit --- diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md index 5eaec89..7822f53 100644 --- a/ReforceXY/reward_space_analysis/README.md +++ b/ReforceXY/reward_space_analysis/README.md @@ -86,12 +86,11 @@ source .venv/bin/activate pip install pandas numpy scipy scikit-learn ``` -Whenever you need to run analyses or tests, activate the environment first: +Whenever you need to run analyses, activate the environment first and execute: ```shell source .venv/bin/activate python reward_space_analysis.py --num_samples 20000 --output reward_space_outputs -python test_reward_space_analysis.py ``` > Deactivate the environment with `deactivate` when you're done. @@ -204,14 +203,14 @@ None - all parameters have sensible defaults. **`--profit_target`** (float, default: 0.03) - Target profit threshold as decimal (e.g., 0.03 = 3%) -- Used for efficiency calculations and holding penalties +- Used for exit reward **`--risk_reward_ratio`** (float, default: 1.0) - Risk/reward ratio multiplier - Affects profit target adjustment in reward calculations -**`--holding_max_ratio`** (float, default: 2.5) +**`--max_duration_ratio`** (float, default: 2.5) - Multiple of max_trade_duration used for sampling trade/idle durations - Higher = more variety in duration scenarios @@ -289,7 +288,7 @@ effective_r = r - grace if exit_plateau and r > grace effective_r = r if not exit_plateau ``` -| Mode | Multiplier (applied to base_factor * pnl * pnl_factor * efficiency) | Monotonic ↓ | Notes | +| Mode | Multiplier (applied to base_factor * pnl * pnl_factor * efficiency_factor) | Monotonic ↓ | Notes | |------|---------------------------------------------------------------------|-------------|-------| | legacy | step: ×1.5 if r* ≤ 1 else ×0.5 | No | Historical discontinuity retained (not smoothed) | | sqrt | 1 / sqrt(1 + r*) | Yes | Sub-linear decay | @@ -299,11 +298,6 @@ effective_r = r if not exit_plateau Where r* = `effective_r` above. -Notes: -- Plateau guarantees continuity at the boundary r = grace for all monotonic kernels; only `legacy` may jump. -- A single implementation in code (`_get_exit_factor`) mirrors this table; this README is the canonical human-readable mapping. -- Continuity tests assert small‑epsilon bounded attenuation onset (excluding `legacy`). - _Efficiency configuration:_ - `efficiency_weight` (default: 1.0) - Weight for efficiency factor in exit reward @@ -515,10 +509,13 @@ done ### Run Tests ```shell -python test_reward_space_analysis.py +# activate the venv first +source .venv/bin/activate +pip install pytest packaging +pytest -q ``` -The suite currently contains 59 tests (current state; this number evolves as new invariants and attenuation modes are added). Always run the full suite after modifying reward logic or attenuation parameters. +Always run the full suite after modifying reward logic or attenuation parameters. ### Test Categories diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py index eae0fd9..865c9fa 100644 --- a/ReforceXY/reward_space_analysis/reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py @@ -698,7 +698,7 @@ def simulate_samples( base_factor: float, profit_target: float, risk_reward_ratio: float, - holding_max_ratio: float, + max_duration_ratio: float, trading_mode: str, pnl_base_std: float, pnl_duration_vol_scale: float, @@ -737,9 +737,25 @@ def simulate_samples( if position == Positions.Neutral: trade_duration = 0 - idle_duration = int(rng.uniform(0, max_trade_duration * holding_max_ratio)) + max_idle_duration_candles = params.get("max_idle_duration_candles") + try: + if max_idle_duration_candles is not None: + max_idle_duration_candles = int(max_idle_duration_candles) + else: + max_idle_duration_candles = int( + max_trade_duration * max_duration_ratio + ) + except (TypeError, ValueError): + max_idle_duration_candles = int(max_trade_duration * max_duration_ratio) + + if max_idle_duration_candles <= 0: + max_idle_duration_candles = int(max_trade_duration * max_duration_ratio) + + idle_duration = int(rng.uniform(0, max_idle_duration_candles)) else: - trade_duration = int(rng.uniform(1, max_trade_duration * holding_max_ratio)) + trade_duration = int( + rng.uniform(1, max_trade_duration * max_duration_ratio) + ) trade_duration = max(1, trade_duration) idle_duration = 0 @@ -1980,7 +1996,7 @@ def build_argument_parser() -> argparse.ArgumentParser: help="Risk reward ratio multiplier (default: 1.0).", ) parser.add_argument( - "--holding_max_ratio", + "--max_duration_ratio", type=float, default=2.5, help="Multiple of max duration used when sampling trade/idle durations.", @@ -2536,7 +2552,7 @@ def main() -> None: base_factor=base_factor, profit_target=profit_target, risk_reward_ratio=risk_reward_ratio, - holding_max_ratio=args.holding_max_ratio, + max_duration_ratio=args.max_duration_ratio, trading_mode=args.trading_mode, pnl_base_std=args.pnl_base_std, pnl_duration_vol_scale=args.pnl_duration_vol_scale, @@ -2549,7 +2565,7 @@ def main() -> None: "base_factor": base_factor, "profit_target": profit_target, "risk_reward_ratio": risk_reward_ratio, - "holding_max_ratio": args.holding_max_ratio, + "max_duration_ratio": args.max_duration_ratio, "trading_mode": args.trading_mode, "action_masking": params.get("action_masking", True), "pnl_base_std": args.pnl_base_std, diff --git a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py index 460f4ed..2bd0918 100644 --- a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py @@ -1252,7 +1252,7 @@ class TestStatisticalValidation(RewardSpaceTestBase): base_factor=TEST_BASE_FACTOR, profit_target=TEST_PROFIT_TARGET, risk_reward_ratio=TEST_RR, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="margin", pnl_base_std=TEST_PNL_STD, pnl_duration_vol_scale=TEST_PNL_DUR_VOL_SCALE, @@ -1369,7 +1369,7 @@ class TestStatisticalValidation(RewardSpaceTestBase): base_factor=TEST_BASE_FACTOR, profit_target=TEST_PROFIT_TARGET, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="margin", pnl_base_std=TEST_PNL_STD, pnl_duration_vol_scale=TEST_PNL_DUR_VOL_SCALE, @@ -1602,7 +1602,7 @@ class TestStatisticalValidation(RewardSpaceTestBase): base_factor=TEST_BASE_FACTOR, profit_target=TEST_PROFIT_TARGET, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="margin", pnl_base_std=TEST_PNL_STD, pnl_duration_vol_scale=TEST_PNL_DUR_VOL_SCALE, @@ -1663,7 +1663,7 @@ class TestStatisticalValidation(RewardSpaceTestBase): base_factor=TEST_BASE_FACTOR, profit_target=TEST_PROFIT_TARGET, risk_reward_ratio=TEST_RR, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="spot", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -1684,7 +1684,7 @@ class TestStatisticalValidation(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="margin", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -1867,7 +1867,7 @@ class TestHelperFunctions(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="spot", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -1882,7 +1882,7 @@ class TestHelperFunctions(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="spot", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -1900,7 +1900,7 @@ class TestHelperFunctions(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="futures", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -1925,7 +1925,7 @@ class TestHelperFunctions(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="spot", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -1959,7 +1959,7 @@ class TestHelperFunctions(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="spot", pnl_base_std=0.02, pnl_duration_vol_scale=0.5, @@ -2084,7 +2084,7 @@ class TestHelperFunctions(RewardSpaceTestBase): base_factor=100.0, profit_target=0.03, risk_reward_ratio=1.0, - holding_max_ratio=2.0, + max_duration_ratio=2.0, trading_mode="margin", pnl_base_std=0.02, pnl_duration_vol_scale=0.5,