From: Jérôme Benoit
Date: Sat, 27 Dec 2025 01:12:16 +0000 (+0100)
Subject: refactor(ReforceXY): improve code readability and maintainability
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=f2a5e5c1116f081f033dade85017d7e842262425;p=freqai-strategies.git

refactor(ReforceXY): improve code readability and maintainability

Signed-off-by: Jérôme Benoit
---

diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index 3cc3b11..94a9d55 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -756,9 +756,15 @@ def add_tunable_cli_args(parser: argparse.ArgumentParser) -> None:
 
 @dataclasses.dataclass
 class RewardContext:
-    """Context for reward computation."""
+    """Context for reward computation.
 
-    pnl: float
+    Attributes
+    ----------
+    current_pnl : float
+        Unrealized PnL at the current tick (state s').
+    """
+
+    current_pnl: float
     trade_duration: int
     idle_duration: int
     max_unrealized_profit: float
@@ -1039,6 +1045,11 @@ def _compute_efficiency_coefficient(
     range_pnl = max_pnl - min_pnl
     if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
         efficiency_ratio = (pnl - min_pnl) / range_pnl
+        # For profits (pnl > 0): high ratio = good exit → higher coefficient → amplify gain
+        # For losses (pnl < 0): high ratio = good exit → LOWER coefficient → attenuate penalty
+        # The sign inversion for losses ensures reward = pnl * coef behaves correctly:
+        # - Good loss exit: small |pnl| * low coef = small penalty
+        # - Bad loss exit: large |pnl| * high coef = large penalty
         if pnl > 0.0:
             efficiency_coefficient = 1.0 + efficiency_weight * (
                 efficiency_ratio - efficiency_center
@@ -1145,9 +1156,15 @@ def _compute_exit_reward(
         float: Exit reward (pnl * exit_factor)
     """
     exit_factor = _get_exit_factor(
-        base_factor, context.pnl, pnl_target, duration_ratio, context, params, risk_reward_ratio
+        base_factor,
+        context.current_pnl,
+        pnl_target,
+        duration_ratio,
+        context,
+        params,
+        risk_reward_ratio,
     )
-    return context.pnl * exit_factor
+    return context.current_pnl * exit_factor
 
 
 def calculate_reward(
@@ -1225,7 +1242,7 @@ def calculate_reward(
     breakdown.base_reward = float(base_reward)
 
     # === PBRS INTEGRATION ===
-    current_pnl = context.pnl if context.position != Positions.Neutral else 0.0
+    current_pnl = context.current_pnl if context.position != Positions.Neutral else 0.0
 
     next_position = _get_next_position(
         context.position, context.action, short_allowed=short_allowed
@@ -1582,7 +1599,7 @@ def simulate_samples(
         )
 
         context = RewardContext(
-            pnl=pnl,
+            current_pnl=pnl,
             trade_duration=trade_duration,
             idle_duration=idle_duration,
             max_unrealized_profit=max_unrealized_profit,
@@ -1606,7 +1623,7 @@ def simulate_samples(
         idle_ratio = context.idle_duration / max(1, max_idle_duration_candles)
         samples.append(
             {
-                "pnl": context.pnl,
+                "pnl": context.current_pnl,
                 "trade_duration": context.trade_duration,
                 "idle_duration": context.idle_duration,
                 "duration_ratio": _compute_duration_ratio(
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
index 496fdfa..f5410d6 100644
--- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
@@ -18,7 +18,7 @@ from reward_space_analysis import (
     get_max_idle_duration_candles,
 )
 
-from ..constants import PARAMS, SCENARIOS, TOLERANCE
+from ..constants import EFFICIENCY, PARAMS, SCENARIOS, TOLERANCE
 from ..helpers import (
     RewardScenarioConfig,
     ThresholdTestConfig,
@@ -301,110 +301,189 @@ class TestRewardComponents(RewardSpaceTestBase):
             action=Actions.Long_exit,
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
 
         self.assertFinite(coefficient, name="efficiency_coefficient")
         self.assertAlmostEqualFloat(coefficient, 1.0, tolerance=TOLERANCE.GENERIC_EQ)
 
-    def test_efficiency_coefficient_optimal_profit_exit(self):
-        """Efficiency coefficient rewards exits near peak unrealized profit.
-
-        Validates that exiting close to maximum unrealized profit produces
-        coefficient > 1.0, incentivizing optimal exit timing for profitable trades.
+    def test_efficiency_coefficient_profits_monotonic_with_exact_bounds(self):
+        """Verify efficiency coefficient monotonicity for profitable trades.
 
         **Setup:**
-        - PnL: 0.029 (very close to max_unrealized_profit=0.03)
-        - Efficiency ratio: (0.029 - 0.0) / (0.03 - 0.0) ≈ 0.967 (high)
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position exiting near peak
+        - efficiency_weight: EFFICIENCY.WEIGHT_DEFAULT (1.0)
+        - efficiency_center: EFFICIENCY.CENTER_DEFAULT (0.5)
+        - PnL range: EFFICIENCY.PNL_RANGE_PROFIT (6 test points)
+        - Unrealized range: [0.0, EFFICIENCY.MAX_UNREALIZED_PROFIT]
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient > 1.0 (rewards optimal timing)
+        - Monotonic non-decreasing coefficients as exit quality improves
+        - Exact coefficient values at bounds match formula: 1 + weight*(ratio - center)
+        - Poor exit: coefficient < 1.0, Optimal exit: coefficient > 1.0
         """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=0.029,  # Close to max
-            trade_duration=10,
-            max_unrealized_profit=0.03,
-            min_unrealized_profit=0.0,
-            position=Positions.Long,
-            action=Actions.Long_exit,
+        params = self.base_params(
+            efficiency_weight=EFFICIENCY.WEIGHT_DEFAULT,
+            efficiency_center=EFFICIENCY.CENTER_DEFAULT,
         )
+        max_unrealized_profit = EFFICIENCY.MAX_UNREALIZED_PROFIT
+        min_unrealized_profit = 0.0
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        pnl_values = list(EFFICIENCY.PNL_RANGE_PROFIT)
+        coefficients = []
 
-        self.assertFinite(coefficient, name="efficiency_coefficient")
-        self.assertGreater(
-            coefficient, 1.0, "Exit near max profit should reward with coefficient > 1.0"
+        for pnl in pnl_values:
+            ctx = self.make_ctx(
+                pnl=pnl,
+                trade_duration=EFFICIENCY.TRADE_DURATION_DEFAULT,
+                max_unrealized_profit=max_unrealized_profit,
+                min_unrealized_profit=min_unrealized_profit,
+                position=Positions.Long,
+                action=Actions.Long_exit,
+            )
+            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
+            coefficients.append(coefficient)
+
+        # Verify monotonic non-decreasing behavior
+        self.assertMonotonic(
+            coefficients,
+            non_decreasing=True,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="efficiency_coefficient_for_profits",
         )
 
-    def test_efficiency_coefficient_poor_profit_exit(self):
-        """Efficiency coefficient penalizes exits far from peak unrealized profit.
+        # Verify exact values at bounds using the formula
+        # coefficient = 1.0 + weight * (ratio - center)
+        # ratio = (pnl - min_pnl) / range_pnl
+        range_pnl = max_unrealized_profit - min_unrealized_profit
+
+        # Poor exit bound (first element)
+        pnl_poor = pnl_values[0]
+        expected_ratio_poor = (pnl_poor - min_unrealized_profit) / range_pnl
+        expected_coef_poor = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            expected_ratio_poor - EFFICIENCY.CENTER_DEFAULT
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[0],
+            expected_coef_poor,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Poor exit coefficient {coefficients[0]:.4f} != expected {expected_coef_poor:.4f}",
+        )
+        self.assertLess(coefficients[0], 1.0, "Poor profit exit should have coefficient < 1.0")
 
-        Validates that exiting far below maximum unrealized profit produces
-        coefficient < 1.0, penalizing poor exit timing that leaves profit on the table.
+        # Optimal exit bound (last element)
+        pnl_optimal = pnl_values[-1]
+        expected_ratio_optimal = (pnl_optimal - min_unrealized_profit) / range_pnl
+        expected_coef_optimal = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            expected_ratio_optimal - EFFICIENCY.CENTER_DEFAULT
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[-1],
+            expected_coef_optimal,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Optimal exit coefficient {coefficients[-1]:.4f} != expected {expected_coef_optimal:.4f}",
+        )
+        self.assertGreater(
+            coefficients[-1], 1.0, "Optimal profit exit should have coefficient > 1.0"
+        )
+
+    def test_efficiency_coefficient_losses_monotonic_with_exact_bounds(self):
+        """Verify efficiency coefficient behavior for losing trades.
 
         **Setup:**
-        - PnL: 0.005 (far from max_unrealized_profit=0.03)
-        - Efficiency ratio: (0.005 - 0.0) / (0.03 - 0.0) ≈ 0.167 (low)
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position exiting prematurely
+        - efficiency_weight: EFFICIENCY.WEIGHT_DEFAULT (1.0)
+        - efficiency_center: EFFICIENCY.CENTER_DEFAULT (0.5)
+        - PnL range: EFFICIENCY.PNL_RANGE_LOSS (7 test points, worst to best)
+        - Unrealized range: [EFFICIENCY.MIN_UNREALIZED_PROFIT, 0.0]
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient < 1.0 (penalizes suboptimal timing)
+        - Coefficient DECREASES as exit quality improves (inverted formula)
+        - Exact values at bounds match: 1 + weight*(center - ratio)
+        - Reward (pnl * coef) is less negative for better exits
         """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=0.005,  # Far from max
-            trade_duration=10,
-            max_unrealized_profit=0.03,
-            min_unrealized_profit=0.0,
-            position=Positions.Long,
-            action=Actions.Long_exit,
-        )
-
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
-
-        self.assertFinite(coefficient, name="efficiency_coefficient")
-        self.assertLess(
-            coefficient, 1.0, "Exit far from max profit should penalize with coefficient < 1.0"
+        params = self.base_params(
+            efficiency_weight=EFFICIENCY.WEIGHT_DEFAULT,
+            efficiency_center=EFFICIENCY.CENTER_DEFAULT,
         )
+        max_unrealized_profit = 0.0
+        min_unrealized_profit = EFFICIENCY.MIN_UNREALIZED_PROFIT
 
-    def test_efficiency_coefficient_optimal_loss_exit(self):
-        """Efficiency coefficient rewards loss exits near minimum unrealized loss.
+        pnl_values = list(EFFICIENCY.PNL_RANGE_LOSS)
+        coefficients = []
+        rewards = []
 
-        Validates that exiting close to minimum unrealized loss produces
-        coefficient > 1.0, rewarding quick loss-cutting behavior for losing trades.
+        for pnl in pnl_values:
+            ctx = self.make_ctx(
+                pnl=pnl,
+                trade_duration=EFFICIENCY.TRADE_DURATION_DEFAULT,
+                max_unrealized_profit=max_unrealized_profit,
+                min_unrealized_profit=min_unrealized_profit,
+                position=Positions.Long,
+                action=Actions.Long_exit,
+            )
+            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
+            coefficients.append(coefficient)
+            # Simplified reward calculation (ignoring other factors for this test)
+            rewards.append(pnl * coefficient)
 
         **Setup:**
-        - PnL: -0.005 (very close to min_unrealized_profit=-0.006)
-        - Efficiency ratio: (-0.005 - (-0.006)) / (0.0 - (-0.006)) ≈ 0.167 (low)
-        - For losses: coefficient = 1 + weight * (center - ratio) → rewards low ratio
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position cutting losses quickly
+        # Verify coefficient DECREASES as exit quality improves (non-increasing)
+        self.assertMonotonic(
+            coefficients,
+            non_increasing=True,  # Decreasing for losses!
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="efficiency_coefficient_for_losses",
+        )
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient > 1.0 (rewards optimal loss exit)
-        """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=-0.005,  # Close to min loss
-            trade_duration=10,
-            max_unrealized_profit=0.0,
-            min_unrealized_profit=-0.006,
-            position=Positions.Long,
-            action=Actions.Long_exit,
+        # Verify reward INCREASES (less negative) as exit quality improves
+        self.assertMonotonic(
+            rewards,
+            non_decreasing=True,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="exit_reward_for_losses",
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        # Verify exact values at bounds using the INVERTED formula for losses
+        # coefficient = 1.0 + weight * (center - ratio)
+        range_pnl = max_unrealized_profit - min_unrealized_profit
 
-        self.assertFinite(coefficient, name="efficiency_coefficient")
+        # Worst exit bound (first element: largest loss)
+        pnl_worst = pnl_values[0]
+        ratio_worst = (pnl_worst - min_unrealized_profit) / range_pnl
+        expected_coef_worst = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            EFFICIENCY.CENTER_DEFAULT - ratio_worst
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[0],
+            expected_coef_worst,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Worst loss coefficient {coefficients[0]:.4f} != expected {expected_coef_worst:.4f}",
+        )
         self.assertGreater(
-            coefficient, 1.0, "Exit near min loss should reward with coefficient > 1.0"
+            coefficients[0],
+            1.0,
+            "Worst loss exit should have coefficient > 1.0 (amplifies penalty)",
+        )
+
+        # Optimal exit bound (last element: minimal loss)
+        pnl_optimal = pnl_values[-1]
+        ratio_optimal = (pnl_optimal - min_unrealized_profit) / range_pnl
+        expected_coef_optimal = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            EFFICIENCY.CENTER_DEFAULT - ratio_optimal
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[-1],
+            expected_coef_optimal,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Minimal loss coefficient {coefficients[-1]:.4f} != expected {expected_coef_optimal:.4f}",
         )
+        self.assertLess(
+            coefficients[-1],
+            1.0,
+            "Minimal loss exit should have coefficient < 1.0 (attenuates penalty)",
+        )
+
+        # Verify the final reward semantics: better exit = less negative reward
+        self.assertLess(rewards[0], rewards[-1], "Worst exit should have more negative reward")
 
     def test_exit_reward_never_positive_for_loss_due_to_efficiency(self):
         """Exit reward should not become positive for a loss trade.
diff --git a/ReforceXY/reward_space_analysis/tests/constants.py b/ReforceXY/reward_space_analysis/tests/constants.py
index 8db775b..ba564e7 100644
--- a/ReforceXY/reward_space_analysis/tests/constants.py
+++ b/ReforceXY/reward_space_analysis/tests/constants.py
@@ -100,6 +100,40 @@ class ExitFactorConfig:
     MIN_POWER_TAU: float = 1e-15
 
 
+@dataclass(frozen=True)
+class EfficiencyConfig:
+    """Efficiency coefficient testing configuration.
+
+    Configuration for exit timing efficiency coefficient validation, including
+    the formula parameters and standard test values.
+
+    The efficiency coefficient modifies exit rewards based on how well the agent
+    timed its exit relative to unrealized PnL extremes during the trade.
+
+    Formula:
+        For profits: coefficient = 1.0 + weight * (ratio - center)
+        For losses: coefficient = 1.0 + weight * (center - ratio) [inverted]
+        Where: ratio = (pnl - min_unrealized) / (max_unrealized - min_unrealized)
+
+    Attributes:
+        WEIGHT_DEFAULT: Default efficiency_weight parameter (1.0)
+        CENTER_DEFAULT: Default efficiency_center parameter (0.5)
+        MAX_UNREALIZED_PROFIT: Standard max unrealized profit for profit tests (0.03)
+        MIN_UNREALIZED_PROFIT: Standard min unrealized profit for loss tests (-0.03)
+        PNL_RANGE_PROFIT: PnL test points for profit tests, ordered worst to best exit
+        PNL_RANGE_LOSS: PnL test points for loss tests, ordered worst to best exit
+        TRADE_DURATION_DEFAULT: Default trade duration for efficiency tests (10)
+    """
+
+    WEIGHT_DEFAULT: float = 1.0
+    CENTER_DEFAULT: float = 0.5
+    MAX_UNREALIZED_PROFIT: float = 0.03
+    MIN_UNREALIZED_PROFIT: float = -0.03
+    PNL_RANGE_PROFIT: tuple[float, ...] = (0.005, 0.010, 0.015, 0.020, 0.025, 0.029)
+    PNL_RANGE_LOSS: tuple[float, ...] = (-0.029, -0.025, -0.020, -0.015, -0.010, -0.005, -0.001)
+    TRADE_DURATION_DEFAULT: int = 10
+
+
 @dataclass(frozen=True)
 class PBRSConfig:
     """Potential-Based Reward Shaping (PBRS) configuration.
@@ -398,6 +432,7 @@ class StatisticalTolerances:
 # Global singleton instances for easy import
 TOLERANCE: Final[ToleranceConfig] = ToleranceConfig()
 CONTINUITY: Final[ContinuityConfig] = ContinuityConfig()
+EFFICIENCY: Final[EfficiencyConfig] = EfficiencyConfig()
 EXIT_FACTOR: Final[ExitFactorConfig] = ExitFactorConfig()
 PBRS: Final[PBRSConfig] = PBRSConfig()
 STATISTICAL: Final[StatisticalConfig] = StatisticalConfig()
@@ -409,6 +444,7 @@ STAT_TOL: Final[StatisticalTolerances] = StatisticalTolerances()
 __all__ = [
     "CONTINUITY",
+    "EFFICIENCY",
     "EXIT_FACTOR",
     "PARAMS",
     "PBRS",
@@ -418,6 +454,7 @@ __all__ = [
     "STAT_TOL",
     "TOLERANCE",
     "ContinuityConfig",
+    "EfficiencyConfig",
     "ExitFactorConfig",
     "PBRSConfig",
     "StatisticalConfig",
diff --git a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
index 76b6cc1..25c1b49 100644
--- a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
+++ b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
@@ -666,13 +666,13 @@ def assert_exit_mode_mathematical_validation(
     )
     pnl_target = profit_aim * risk_reward_ratio
     pnl_target_coefficient = _compute_pnl_target_coefficient(
-        params, context.pnl, pnl_target, risk_reward_ratio
+        params, context.current_pnl, pnl_target, risk_reward_ratio
    )
-    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.pnl)
+    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.current_pnl)
 
     observed_exit_factor = _get_exit_factor(
         base_factor,
-        context.pnl,
+        context.current_pnl,
         pnl_target,
         duration_ratio,
         context,
diff --git a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
index 85dc8e2..cbbc3e5 100644
--- a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
+++ b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
@@ -748,7 +748,7 @@ class TestPBRS(RewardSpaceTestBase):
         )
 
         expected_next_potential = _compute_hold_potential(
-            pnl=ctx.pnl,
+            pnl=ctx.current_pnl,
             pnl_target=PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
             duration_ratio=(trade_duration / max_trade_duration_candles),
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
@@ -958,7 +958,7 @@ class TestPBRS(RewardSpaceTestBase):
         current_duration_ratio = ctx.trade_duration / params["max_trade_duration_candles"]
 
         prev_potential = _compute_hold_potential(
-            ctx.pnl,
+            ctx.current_pnl,
             pnl_target,
             current_duration_ratio,
             PARAMS.RISK_REWARD_RATIO,
diff --git a/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py b/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
index 7487bfb..e0c1db2 100644
--- a/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
+++ b/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
@@ -171,8 +171,8 @@ class TestStatistics(RewardSpaceTestBase):
         if len(df) > 30:
             idle_data = df[df["idle_duration"] > 0]
             if len(idle_data) > 10:
-                idle_dur = np.asarray(idle_data["idle_duration"], dtype=float)
-                idle_rew = np.asarray(idle_data["reward_idle"], dtype=float)
+                idle_dur = idle_data["idle_duration"].to_numpy(dtype=float)
+                idle_rew = idle_data["reward_idle"].to_numpy(dtype=float)
                 self.assertTrue(
                     len(idle_dur) == len(idle_rew),
                     "Idle duration and reward arrays should have same length",
diff --git a/ReforceXY/reward_space_analysis/tests/test_base.py b/ReforceXY/reward_space_analysis/tests/test_base.py
index 86f3fd0..4bb2698 100644
--- a/ReforceXY/reward_space_analysis/tests/test_base.py
+++ b/ReforceXY/reward_space_analysis/tests/test_base.py
@@ -45,7 +45,7 @@ def make_ctx(
 ) -> RewardContext:
     """Create a RewardContext with neutral defaults."""
     return RewardContext(
-        pnl=pnl,
+        current_pnl=pnl,
         trade_duration=trade_duration,
         idle_duration=idle_duration,
         max_unrealized_profit=max_unrealized_profit,
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 8974741..ebc7afd 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1372,13 +1372,13 @@ class ReforceXY(BaseReinforcementLearningModel):
             )
             hyperopt_failed = True
         time_spent = time.time() - start_time
-        n_complete = len([t for t in study.trials if t.state == TrialState.COMPLETE])
+        n_completed = len([t for t in study.trials if t.state == TrialState.COMPLETE])
         n_pruned = len([t for t in study.trials if t.state == TrialState.PRUNED])
         n_failed = len([t for t in study.trials if t.state == TrialState.FAIL])
         logger.info(
-            "Hyperopt %s: %s complete, %s pruned, %s failed trials",
+            "Hyperopt %s: %s completed, %s pruned, %s failed trials",
             study_name,
-            n_complete,
+            n_completed,
             n_pruned,
             n_failed,
         )
@@ -1975,31 +1975,46 @@ class MyRLEnv(Base5ActionRLEnv):
         self,
         action: int,
         trade_duration: float,
-        pnl: float,
+        current_pnl: float,
     ) -> Tuple[Positions, int, float]:
-        """Compute next transition state tuple."""
+        """Compute next transition state tuple (next_position, next_duration, next_pnl).
+
+        Parameters
+        ----------
+        action : int
+            Action taken by the agent.
+        trade_duration : float
+            Trade duration at current tick.
+        current_pnl : float
+            Unrealized PnL at current tick.
+
+        Returns
+        -------
+        tuple[Positions, int, float]
+            (next_position, next_trade_duration, next_pnl) for the transition s -> s'.
+        """
         next_position = self._get_next_position(action)
 
-        # Entry
+        # Entry: Neutral -> Long/Short
         if self._position == Positions.Neutral and next_position in (
             Positions.Long,
             Positions.Short,
         ):
             return next_position, 0, self._get_entry_unrealized_profit(next_position)
 
-        # Exit
+        # Exit: Long/Short -> Neutral
         if (
             self._position in (Positions.Long, Positions.Short)
             and next_position == Positions.Neutral
         ):
             return next_position, 0, 0.0
 
-        # Hold
+        # Hold: Long/Short -> Long/Short
         if self._position in (Positions.Long, Positions.Short) and next_position in (
             Positions.Long,
             Positions.Short,
         ):
-            return next_position, int(trade_duration), pnl
+            return next_position, int(trade_duration), current_pnl
 
         # Neutral self-loop
         return next_position, 0, 0.0
@@ -2258,7 +2273,7 @@ class MyRLEnv(Base5ActionRLEnv):
         action: int,
         trade_duration: float,
         max_trade_duration: float,
-        pnl: float,
+        current_pnl: float,
         pnl_target: float,
         hold_potential_scale: float,
         entry_additive_scale: float,
@@ -2395,11 +2410,13 @@ class MyRLEnv(Base5ActionRLEnv):
         action : int
            Action taken: determines transition type (entry/hold/exit)
         trade_duration : float
-            Current trade duration in candles (for current state s)
+            Trade duration at current tick.
+            This is the duration for state s'.
         max_trade_duration : float
             Maximum allowed trade duration (for normalization)
-        pnl : float
-            Current position PnL (for current state s)
+        current_pnl : float
+            Unrealized PnL at current tick.
+            This is the PnL for state s'.
         pnl_target : float
             Target PnL for ratio normalization: r_pnl = pnl / pnl_target
         hold_potential_scale : float
@@ -2449,7 +2466,7 @@ class MyRLEnv(Base5ActionRLEnv):
             return 0.0, 0.0, 0.0
 
         next_position, next_trade_duration, next_pnl = self._get_next_transition_state(
-            action=action, trade_duration=trade_duration, pnl=pnl
+            action=action, trade_duration=trade_duration, current_pnl=current_pnl
         )
         if max_trade_duration <= 0:
             next_duration_ratio = 0.0
@@ -2520,7 +2537,7 @@ class MyRLEnv(Base5ActionRLEnv):
         if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
             duration_ratio = trade_duration / max(1, max_trade_duration)
             exit_additive = self._compute_exit_additive(
-                pnl, pnl_target, duration_ratio, exit_additive_scale
+                current_pnl, pnl_target, duration_ratio, exit_additive_scale
             )
             self._total_exit_additive += float(exit_additive)
 
@@ -2988,7 +3005,7 @@ class MyRLEnv(Base5ActionRLEnv):
             action=action,
             trade_duration=trade_duration,
             max_trade_duration=max_trade_duration,
-            pnl=pnl,
+            current_pnl=pnl,
             pnl_target=self._pnl_target,
             hold_potential_scale=hold_potential_scale,
            entry_additive_scale=entry_additive_scale,
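
For reference, a minimal standalone sketch of the efficiency coefficient formula
documented above, assuming the default weight of 1.0 and center of 0.5. The helper
name efficiency_coefficient is illustrative only and is not the project's
_compute_efficiency_coefficient implementation:

import numpy as np


def efficiency_coefficient(
    pnl: float,
    min_pnl: float,
    max_pnl: float,
    weight: float = 1.0,
    center: float = 0.5,
) -> float:
    """Scale an exit reward by how well the exit was timed within [min_pnl, max_pnl]."""
    range_pnl = max_pnl - min_pnl
    if not np.isfinite(range_pnl) or np.isclose(range_pnl, 0.0):
        return 1.0  # degenerate range: leave the exit reward unscaled
    ratio = (pnl - min_pnl) / range_pnl
    if pnl > 0.0:
        # Profits: exiting near the peak (high ratio) raises the coefficient and amplifies the gain.
        return 1.0 + weight * (ratio - center)
    # Losses: a smaller loss (high ratio) lowers the coefficient and attenuates the penalty.
    return 1.0 + weight * (center - ratio)


# Profit exit near the 0.03 peak -> coefficient above 1.0 (about 1.47), amplifying the gain.
print(efficiency_coefficient(0.029, 0.0, 0.03))
# Loss exit near the -0.03 trough -> coefficient above 1.0 (about 1.47), amplifying the penalty.
print(efficiency_coefficient(-0.029, -0.03, 0.0))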