From: Jérôme Benoit
Date: Sat, 27 Dec 2025 01:12:16 +0000 (+0100)
Subject: refactor(ReforceXY): improve code readability and maintainability
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=f2a5e5c1116f081f033dade85017d7e842262425;p=freqai-strategies.git

refactor(ReforceXY): improve code readability and maintainability

Signed-off-by: Jérôme Benoit
---

diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index 3cc3b11..94a9d55 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -756,9 +756,15 @@ def add_tunable_cli_args(parser: argparse.ArgumentParser) -> None:
 
 @dataclasses.dataclass
 class RewardContext:
-    """Context for reward computation."""
+    """Context for reward computation.
 
-    pnl: float
+    Attributes
+    ----------
+    current_pnl : float
+        Unrealized PnL at the current tick (state s').
+    """
+
+    current_pnl: float
     trade_duration: int
     idle_duration: int
     max_unrealized_profit: float
@@ -1039,6 +1045,11 @@ def _compute_efficiency_coefficient(
     range_pnl = max_pnl - min_pnl
     if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
         efficiency_ratio = (pnl - min_pnl) / range_pnl
+        # For profits (pnl > 0): high ratio = good exit → higher coefficient → amplify gain
+        # For losses (pnl < 0): high ratio = good exit → LOWER coefficient → attenuate penalty
+        # The sign inversion for losses ensures reward = pnl * coef behaves correctly:
+        # - Good loss exit: small |pnl| * low coef = small penalty
+        # - Bad loss exit: large |pnl| * high coef = large penalty
         if pnl > 0.0:
             efficiency_coefficient = 1.0 + efficiency_weight * (
                 efficiency_ratio - efficiency_center
@@ -1145,9 +1156,15 @@ def _compute_exit_reward(
         float: Exit reward (pnl * exit_factor)
     """
     exit_factor = _get_exit_factor(
-        base_factor, context.pnl, pnl_target, duration_ratio, context, params, risk_reward_ratio
+        base_factor,
+        context.current_pnl,
+        pnl_target,
+        duration_ratio,
+        context,
+        params,
+        risk_reward_ratio,
     )
-    return context.pnl * exit_factor
+    return context.current_pnl * exit_factor
 
 
 def calculate_reward(
@@ -1225,7 +1242,7 @@ def calculate_reward(
     breakdown.base_reward = float(base_reward)
 
     # === PBRS INTEGRATION ===
-    current_pnl = context.pnl if context.position != Positions.Neutral else 0.0
+    current_pnl = context.current_pnl if context.position != Positions.Neutral else 0.0
 
     next_position = _get_next_position(
         context.position, context.action, short_allowed=short_allowed
@@ -1582,7 +1599,7 @@ def simulate_samples(
         )
 
         context = RewardContext(
-            pnl=pnl,
+            current_pnl=pnl,
             trade_duration=trade_duration,
             idle_duration=idle_duration,
             max_unrealized_profit=max_unrealized_profit,
@@ -1606,7 +1623,7 @@ def simulate_samples(
         idle_ratio = context.idle_duration / max(1, max_idle_duration_candles)
         samples.append(
             {
-                "pnl": context.pnl,
+                "pnl": context.current_pnl,
                 "trade_duration": context.trade_duration,
                 "idle_duration": context.idle_duration,
                 "duration_ratio": _compute_duration_ratio(
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
index 496fdfa..f5410d6 100644
--- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
@@ -18,7 +18,7 @@ from reward_space_analysis import (
     get_max_idle_duration_candles,
 )
 
-from ..constants import PARAMS, SCENARIOS, TOLERANCE
+from ..constants import EFFICIENCY, PARAMS, SCENARIOS, TOLERANCE
 from ..helpers import (
     RewardScenarioConfig,
     ThresholdTestConfig,
@@ -301,110 +301,189 @@ class TestRewardComponents(RewardSpaceTestBase):
             action=Actions.Long_exit,
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
 
         self.assertFinite(coefficient, name="efficiency_coefficient")
         self.assertAlmostEqualFloat(coefficient, 1.0, tolerance=TOLERANCE.GENERIC_EQ)
 
-    def test_efficiency_coefficient_optimal_profit_exit(self):
-        """Efficiency coefficient rewards exits near peak unrealized profit.
-
-        Validates that exiting close to maximum unrealized profit produces
-        coefficient > 1.0, incentivizing optimal exit timing for profitable trades.
+    def test_efficiency_coefficient_profits_monotonic_with_exact_bounds(self):
+        """Verify efficiency coefficient monotonicity for profitable trades.
 
         **Setup:**
-        - PnL: 0.029 (very close to max_unrealized_profit=0.03)
-        - Efficiency ratio: (0.029 - 0.0) / (0.03 - 0.0) ≈ 0.967 (high)
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position exiting near peak
+        - efficiency_weight: EFFICIENCY.WEIGHT_DEFAULT (1.0)
+        - efficiency_center: EFFICIENCY.CENTER_DEFAULT (0.5)
+        - PnL range: EFFICIENCY.PNL_RANGE_PROFIT (6 test points)
+        - Unrealized range: [0.0, EFFICIENCY.MAX_UNREALIZED_PROFIT]
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient > 1.0 (rewards optimal timing)
+        - Monotonic non-decreasing coefficients as exit quality improves
+        - Exact coefficient values at bounds match formula: 1 + weight*(ratio - center)
+        - Poor exit: coefficient < 1.0, Optimal exit: coefficient > 1.0
         """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=0.029,  # Close to max
-            trade_duration=10,
-            max_unrealized_profit=0.03,
-            min_unrealized_profit=0.0,
-            position=Positions.Long,
-            action=Actions.Long_exit,
+        params = self.base_params(
+            efficiency_weight=EFFICIENCY.WEIGHT_DEFAULT,
+            efficiency_center=EFFICIENCY.CENTER_DEFAULT,
         )
+        max_unrealized_profit = EFFICIENCY.MAX_UNREALIZED_PROFIT
+        min_unrealized_profit = 0.0
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        pnl_values = list(EFFICIENCY.PNL_RANGE_PROFIT)
+        coefficients = []
 
-        self.assertFinite(coefficient, name="efficiency_coefficient")
-        self.assertGreater(
-            coefficient, 1.0, "Exit near max profit should reward with coefficient > 1.0"
+        for pnl in pnl_values:
+            ctx = self.make_ctx(
+                pnl=pnl,
+                trade_duration=EFFICIENCY.TRADE_DURATION_DEFAULT,
+                max_unrealized_profit=max_unrealized_profit,
+                min_unrealized_profit=min_unrealized_profit,
+                position=Positions.Long,
+                action=Actions.Long_exit,
+            )
+            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
+            coefficients.append(coefficient)
+
+        # Verify monotonic non-decreasing behavior
+        self.assertMonotonic(
+            coefficients,
+            non_decreasing=True,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="efficiency_coefficient_for_profits",
         )
 
-    def test_efficiency_coefficient_poor_profit_exit(self):
-        """Efficiency coefficient penalizes exits far from peak unrealized profit.
+        # Verify exact values at bounds using the formula
+        # coefficient = 1.0 + weight * (ratio - center)
+        # ratio = (pnl - min_pnl) / range_pnl
+        range_pnl = max_unrealized_profit - min_unrealized_profit
+
+        # Poor exit bound (first element)
+        pnl_poor = pnl_values[0]
+        expected_ratio_poor = (pnl_poor - min_unrealized_profit) / range_pnl
+        expected_coef_poor = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            expected_ratio_poor - EFFICIENCY.CENTER_DEFAULT
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[0],
+            expected_coef_poor,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Poor exit coefficient {coefficients[0]:.4f} != expected {expected_coef_poor:.4f}",
+        )
+        self.assertLess(coefficients[0], 1.0, "Poor profit exit should have coefficient < 1.0")
 
-        Validates that exiting far below maximum unrealized profit produces
-        coefficient < 1.0, penalizing poor exit timing that leaves profit on the table.
+        # Optimal exit bound (last element)
+        pnl_optimal = pnl_values[-1]
+        expected_ratio_optimal = (pnl_optimal - min_unrealized_profit) / range_pnl
+        expected_coef_optimal = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            expected_ratio_optimal - EFFICIENCY.CENTER_DEFAULT
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[-1],
+            expected_coef_optimal,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Optimal exit coefficient {coefficients[-1]:.4f} != expected {expected_coef_optimal:.4f}",
+        )
+        self.assertGreater(
+            coefficients[-1], 1.0, "Optimal profit exit should have coefficient > 1.0"
+        )
+
+    def test_efficiency_coefficient_losses_monotonic_with_exact_bounds(self):
+        """Verify efficiency coefficient behavior for losing trades.
 
         **Setup:**
-        - PnL: 0.005 (far from max_unrealized_profit=0.03)
-        - Efficiency ratio: (0.005 - 0.0) / (0.03 - 0.0) ≈ 0.167 (low)
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position exiting prematurely
+        - efficiency_weight: EFFICIENCY.WEIGHT_DEFAULT (1.0)
+        - efficiency_center: EFFICIENCY.CENTER_DEFAULT (0.5)
+        - PnL range: EFFICIENCY.PNL_RANGE_LOSS (7 test points, worst to best)
+        - Unrealized range: [EFFICIENCY.MIN_UNREALIZED_PROFIT, 0.0]
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient < 1.0 (penalizes suboptimal timing)
+        - Coefficient DECREASES as exit quality improves (inverted formula)
+        - Exact values at bounds match: 1 + weight*(center - ratio)
+        - Reward (pnl * coef) is less negative for better exits
         """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=0.005,  # Far from max
-            trade_duration=10,
-            max_unrealized_profit=0.03,
-            min_unrealized_profit=0.0,
-            position=Positions.Long,
-            action=Actions.Long_exit,
-        )
-
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
-
-        self.assertFinite(coefficient, name="efficiency_coefficient")
-        self.assertLess(
-            coefficient, 1.0, "Exit far from max profit should penalize with coefficient < 1.0"
+        params = self.base_params(
+            efficiency_weight=EFFICIENCY.WEIGHT_DEFAULT,
+            efficiency_center=EFFICIENCY.CENTER_DEFAULT,
         )
+        max_unrealized_profit = 0.0
+        min_unrealized_profit = EFFICIENCY.MIN_UNREALIZED_PROFIT
 
-    def test_efficiency_coefficient_optimal_loss_exit(self):
-        """Efficiency coefficient rewards loss exits near minimum unrealized loss.
+        pnl_values = list(EFFICIENCY.PNL_RANGE_LOSS)
+        coefficients = []
+        rewards = []
 
-        Validates that exiting close to minimum unrealized loss produces
-        coefficient > 1.0, rewarding quick loss-cutting behavior for losing trades.
+        for pnl in pnl_values:
+            ctx = self.make_ctx(
+                pnl=pnl,
+                trade_duration=EFFICIENCY.TRADE_DURATION_DEFAULT,
+                max_unrealized_profit=max_unrealized_profit,
+                min_unrealized_profit=min_unrealized_profit,
+                position=Positions.Long,
+                action=Actions.Long_exit,
+            )
+            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
+            coefficients.append(coefficient)
+            # Simplified reward calculation (ignoring other factors for this test)
+            rewards.append(pnl * coefficient)
 
         **Setup:**
-        - PnL: -0.005 (very close to min_unrealized_profit=-0.006)
-        - Efficiency ratio: (-0.005 - (-0.006)) / (0.0 - (-0.006)) ≈ 0.167 (low)
-        - For losses: coefficient = 1 + weight * (center - ratio) → rewards low ratio
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position cutting losses quickly
+        # Verify coefficient DECREASES as exit quality improves (non-increasing)
+        self.assertMonotonic(
+            coefficients,
+            non_increasing=True,  # Decreasing for losses!
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="efficiency_coefficient_for_losses",
+        )
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient > 1.0 (rewards optimal loss exit)
-        """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=-0.005,  # Close to min loss
-            trade_duration=10,
-            max_unrealized_profit=0.0,
-            min_unrealized_profit=-0.006,
-            position=Positions.Long,
-            action=Actions.Long_exit,
+        # Verify reward INCREASES (less negative) as exit quality improves
+        self.assertMonotonic(
+            rewards,
+            non_decreasing=True,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="exit_reward_for_losses",
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        # Verify exact values at bounds using the INVERTED formula for losses
+        # coefficient = 1.0 + weight * (center - ratio)
+        range_pnl = max_unrealized_profit - min_unrealized_profit
 
-        self.assertFinite(coefficient, name="efficiency_coefficient")
+        # Worst exit bound (first element: largest loss)
+        pnl_worst = pnl_values[0]
+        ratio_worst = (pnl_worst - min_unrealized_profit) / range_pnl
+        expected_coef_worst = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            EFFICIENCY.CENTER_DEFAULT - ratio_worst
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[0],
+            expected_coef_worst,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Worst loss coefficient {coefficients[0]:.4f} != expected {expected_coef_worst:.4f}",
+        )
         self.assertGreater(
-            coefficient, 1.0, "Exit near min loss should reward with coefficient > 1.0"
+            coefficients[0],
+            1.0,
+            "Worst loss exit should have coefficient > 1.0 (amplifies penalty)",
+        )
+
+        # Optimal exit bound (last element: minimal loss)
+        pnl_optimal = pnl_values[-1]
+        ratio_optimal = (pnl_optimal - min_unrealized_profit) / range_pnl
+        expected_coef_optimal = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            EFFICIENCY.CENTER_DEFAULT - ratio_optimal
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[-1],
+            expected_coef_optimal,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Minimal loss coefficient {coefficients[-1]:.4f} != expected {expected_coef_optimal:.4f}",
         )
+        self.assertLess(
+            coefficients[-1],
+            1.0,
+            "Minimal loss exit should have coefficient < 1.0 (attenuates penalty)",
+        )
+
+        # Verify the final reward semantics: better exit = less negative reward
+        self.assertLess(rewards[0], rewards[-1], "Worst exit should have more negative reward")
 
     def test_exit_reward_never_positive_for_loss_due_to_efficiency(self):
         """Exit reward should not become positive for a loss trade.
diff --git a/ReforceXY/reward_space_analysis/tests/constants.py b/ReforceXY/reward_space_analysis/tests/constants.py
index 8db775b..ba564e7 100644
--- a/ReforceXY/reward_space_analysis/tests/constants.py
+++ b/ReforceXY/reward_space_analysis/tests/constants.py
@@ -100,6 +100,40 @@ class ExitFactorConfig:
     MIN_POWER_TAU: float = 1e-15
 
 
+@dataclass(frozen=True)
+class EfficiencyConfig:
+    """Efficiency coefficient testing configuration.
+
+    Configuration for exit timing efficiency coefficient validation, including
+    the formula parameters and standard test values.
+
+    The efficiency coefficient modifies exit rewards based on how well the agent
+    timed its exit relative to unrealized PnL extremes during the trade.
+
+    Formula:
+        For profits: coefficient = 1.0 + weight * (ratio - center)
+        For losses: coefficient = 1.0 + weight * (center - ratio) [inverted]
+        Where: ratio = (pnl - min_unrealized) / (max_unrealized - min_unrealized)
+
+    Attributes:
+        WEIGHT_DEFAULT: Default efficiency_weight parameter (1.0)
+        CENTER_DEFAULT: Default efficiency_center parameter (0.5)
+        MAX_UNREALIZED_PROFIT: Standard max unrealized profit for profit tests (0.03)
+        MIN_UNREALIZED_PROFIT: Standard min unrealized profit for loss tests (-0.03)
+        PNL_RANGE_PROFIT: PnL test points for profit tests, ordered worst to best exit
+        PNL_RANGE_LOSS: PnL test points for loss tests, ordered worst to best exit
+        TRADE_DURATION_DEFAULT: Default trade duration for efficiency tests (10)
+    """
+
+    WEIGHT_DEFAULT: float = 1.0
+    CENTER_DEFAULT: float = 0.5
+    MAX_UNREALIZED_PROFIT: float = 0.03
+    MIN_UNREALIZED_PROFIT: float = -0.03
+    PNL_RANGE_PROFIT: tuple[float, ...] = (0.005, 0.010, 0.015, 0.020, 0.025, 0.029)
+    PNL_RANGE_LOSS: tuple[float, ...] = (-0.029, -0.025, -0.020, -0.015, -0.010, -0.005, -0.001)
+    TRADE_DURATION_DEFAULT: int = 10
+
+
 @dataclass(frozen=True)
 class PBRSConfig:
     """Potential-Based Reward Shaping (PBRS) configuration.
@@ -398,6 +432,7 @@ class StatisticalTolerances:
 # Global singleton instances for easy import
 TOLERANCE: Final[ToleranceConfig] = ToleranceConfig()
 CONTINUITY: Final[ContinuityConfig] = ContinuityConfig()
+EFFICIENCY: Final[EfficiencyConfig] = EfficiencyConfig()
 EXIT_FACTOR: Final[ExitFactorConfig] = ExitFactorConfig()
 PBRS: Final[PBRSConfig] = PBRSConfig()
 STATISTICAL: Final[StatisticalConfig] = StatisticalConfig()
@@ -409,6 +444,7 @@ STAT_TOL: Final[StatisticalTolerances] = StatisticalTolerances()
 __all__ = [
     "CONTINUITY",
+    "EFFICIENCY",
     "EXIT_FACTOR",
     "PARAMS",
     "PBRS",
@@ -418,6 +454,7 @@ __all__ = [
     "STAT_TOL",
     "TOLERANCE",
     "ContinuityConfig",
+    "EfficiencyConfig",
     "ExitFactorConfig",
     "PBRSConfig",
     "StatisticalConfig",
diff --git a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
index 76b6cc1..25c1b49 100644
--- a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
+++ b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
@@ -666,13 +666,13 @@ def assert_exit_mode_mathematical_validation(
     )
     pnl_target = profit_aim * risk_reward_ratio
     pnl_target_coefficient = _compute_pnl_target_coefficient(
-        params, context.pnl, pnl_target, risk_reward_ratio
+        params, context.current_pnl, pnl_target, risk_reward_ratio
    )
-    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.pnl)
+    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.current_pnl)
 
     observed_exit_factor = _get_exit_factor(
         base_factor,
-        context.pnl,
+        context.current_pnl,
         pnl_target,
         duration_ratio,
         context,
diff --git a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
index 85dc8e2..cbbc3e5 100644
--- a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
+++ b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
@@ -748,7 +748,7 @@ class TestPBRS(RewardSpaceTestBase):
         )
 
         expected_next_potential = _compute_hold_potential(
-            pnl=ctx.pnl,
+            pnl=ctx.current_pnl,
             pnl_target=PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
             duration_ratio=(trade_duration / max_trade_duration_candles),
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
@@ -958,7 +958,7 @@ class TestPBRS(RewardSpaceTestBase):
         current_duration_ratio = ctx.trade_duration / params["max_trade_duration_candles"]
 
         prev_potential = _compute_hold_potential(
-            ctx.pnl,
+            ctx.current_pnl,
             pnl_target,
             current_duration_ratio,
             PARAMS.RISK_REWARD_RATIO,
diff --git a/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py b/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
index 7487bfb..e0c1db2 100644
--- a/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
+++ b/ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
@@ -171,8 +171,8 @@ class TestStatistics(RewardSpaceTestBase):
         if len(df) > 30:
             idle_data = df[df["idle_duration"] > 0]
             if len(idle_data) > 10:
-                idle_dur = np.asarray(idle_data["idle_duration"], dtype=float)
-                idle_rew = np.asarray(idle_data["reward_idle"], dtype=float)
+                idle_dur = idle_data["idle_duration"].to_numpy(dtype=float)
+                idle_rew = idle_data["reward_idle"].to_numpy(dtype=float)
                 self.assertTrue(
                     len(idle_dur) == len(idle_rew),
                     "Idle duration and reward arrays should have same length",
diff --git a/ReforceXY/reward_space_analysis/tests/test_base.py b/ReforceXY/reward_space_analysis/tests/test_base.py
index 86f3fd0..4bb2698 100644
--- a/ReforceXY/reward_space_analysis/tests/test_base.py
+++ b/ReforceXY/reward_space_analysis/tests/test_base.py
@@ -45,7 +45,7 @@ def make_ctx(
 ) -> RewardContext:
     """Create a RewardContext with neutral defaults."""
     return RewardContext(
-        pnl=pnl,
+        current_pnl=pnl,
         trade_duration=trade_duration,
         idle_duration=idle_duration,
         max_unrealized_profit=max_unrealized_profit,
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 8974741..ebc7afd 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1372,13 +1372,13 @@ class ReforceXY(BaseReinforcementLearningModel):
             )
             hyperopt_failed = True
         time_spent = time.time() - start_time
-        n_complete = len([t for t in study.trials if t.state == TrialState.COMPLETE])
+        n_completed = len([t for t in study.trials if t.state == TrialState.COMPLETE])
         n_pruned = len([t for t in study.trials if t.state == TrialState.PRUNED])
         n_failed = len([t for t in study.trials if t.state == TrialState.FAIL])
         logger.info(
-            "Hyperopt %s: %s complete, %s pruned, %s failed trials",
+            "Hyperopt %s: %s completed, %s pruned, %s failed trials",
             study_name,
-            n_complete,
+            n_completed,
             n_pruned,
             n_failed,
         )
@@ -1975,31 +1975,46 @@ class MyRLEnv(Base5ActionRLEnv):
         self,
         action: int,
         trade_duration: float,
-        pnl: float,
+        current_pnl: float,
     ) -> Tuple[Positions, int, float]:
-        """Compute next transition state tuple."""
+        """Compute next transition state tuple (next_position, next_duration, next_pnl).
+
+        Parameters
+        ----------
+        action : int
+            Action taken by the agent.
+        trade_duration : float
+            Trade duration at current tick.
+        current_pnl : float
+            Unrealized PnL at current tick.
+
+        Returns
+        -------
+        tuple[Positions, int, float]
+            (next_position, next_trade_duration, next_pnl) for the transition s -> s'.
+        """
         next_position = self._get_next_position(action)
 
-        # Entry
+        # Entry: Neutral -> Long/Short
         if self._position == Positions.Neutral and next_position in (
             Positions.Long,
             Positions.Short,
         ):
             return next_position, 0, self._get_entry_unrealized_profit(next_position)
 
-        # Exit
+        # Exit: Long/Short -> Neutral
         if (
             self._position in (Positions.Long, Positions.Short)
             and next_position == Positions.Neutral
         ):
             return next_position, 0, 0.0
 
-        # Hold
+        # Hold: Long/Short -> Long/Short
         if self._position in (Positions.Long, Positions.Short) and next_position in (
             Positions.Long,
             Positions.Short,
         ):
-            return next_position, int(trade_duration), pnl
+            return next_position, int(trade_duration), current_pnl
 
         # Neutral self-loop
         return next_position, 0, 0.0
@@ -2258,7 +2273,7 @@ class MyRLEnv(Base5ActionRLEnv):
         action: int,
         trade_duration: float,
         max_trade_duration: float,
-        pnl: float,
+        current_pnl: float,
         pnl_target: float,
         hold_potential_scale: float,
         entry_additive_scale: float,
@@ -2395,11 +2410,13 @@ class MyRLEnv(Base5ActionRLEnv):
         action : int
            Action taken: determines transition type (entry/hold/exit)
         trade_duration : float
-            Current trade duration in candles (for current state s)
+            Trade duration at current tick.
+            This is the duration for state s'.
         max_trade_duration : float
             Maximum allowed trade duration (for normalization)
-        pnl : float
-            Current position PnL (for current state s)
+        current_pnl : float
+            Unrealized PnL at current tick.
+            This is the PnL for state s'.
         pnl_target : float
             Target PnL for ratio normalization: r_pnl = pnl / pnl_target
         hold_potential_scale : float
@@ -2449,7 +2466,7 @@ class MyRLEnv(Base5ActionRLEnv):
             return 0.0, 0.0, 0.0
 
         next_position, next_trade_duration, next_pnl = self._get_next_transition_state(
-            action=action, trade_duration=trade_duration, pnl=pnl
+            action=action, trade_duration=trade_duration, current_pnl=current_pnl
         )
         if max_trade_duration <= 0:
             next_duration_ratio = 0.0
@@ -2520,7 +2537,7 @@ class MyRLEnv(Base5ActionRLEnv):
         if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
             duration_ratio = trade_duration / max(1, max_trade_duration)
             exit_additive = self._compute_exit_additive(
-                pnl, pnl_target, duration_ratio, exit_additive_scale
+                current_pnl, pnl_target, duration_ratio, exit_additive_scale
             )
             self._total_exit_additive += float(exit_additive)
 
@@ -2988,7 +3005,7 @@ class MyRLEnv(Base5ActionRLEnv):
             action=action,
             trade_duration=trade_duration,
             max_trade_duration=max_trade_duration,
-            pnl=pnl,
+            current_pnl=pnl,
             pnl_target=self._pnl_target,
             hold_potential_scale=hold_potential_scale,
            entry_additive_scale=entry_additive_scale,
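
For reference, a minimal standalone sketch of the efficiency coefficient formula
documented above, assuming the default weight of 1.0 and center of 0.5. The helper
name efficiency_coefficient is illustrative only and is not the project's
_compute_efficiency_coefficient implementation:

import numpy as np


def efficiency_coefficient(
    pnl: float,
    min_pnl: float,
    max_pnl: float,
    weight: float = 1.0,
    center: float = 0.5,
) -> float:
    """Scale an exit reward by how well the exit was timed within [min_pnl, max_pnl]."""
    range_pnl = max_pnl - min_pnl
    if not np.isfinite(range_pnl) or np.isclose(range_pnl, 0.0):
        return 1.0  # degenerate range: leave the exit reward unscaled
    ratio = (pnl - min_pnl) / range_pnl
    if pnl > 0.0:
        # Profits: exiting near the peak (high ratio) raises the coefficient and amplifies the gain.
        return 1.0 + weight * (ratio - center)
    # Losses: a smaller loss (high ratio) lowers the coefficient and attenuates the penalty.
    return 1.0 + weight * (center - ratio)


# Profit exit near the 0.03 peak -> coefficient above 1.0 (about 1.47), amplifying the gain.
print(efficiency_coefficient(0.029, 0.0, 0.03))
# Loss exit near the -0.03 trough -> coefficient above 1.0 (about 1.47), amplifying the penalty.
print(efficiency_coefficient(-0.029, -0.03, 0.0))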