Piment Noir Git Repositories - freqai-strategies.git/commitdiff
refactor(ReforceXY): improve code readability and maintainability
author    Jérôme Benoit <jerome.benoit@piment-noir.org>
Sat, 27 Dec 2025 01:12:16 +0000 (02:12 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Sat, 27 Dec 2025 01:12:16 +0000 (02:12 +0100)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
ReforceXY/reward_space_analysis/tests/constants.py
ReforceXY/reward_space_analysis/tests/helpers/assertions.py
ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
ReforceXY/reward_space_analysis/tests/statistics/test_statistics.py
ReforceXY/reward_space_analysis/tests/test_base.py
ReforceXY/user_data/freqaimodels/ReforceXY.py

index 3cc3b113974e22256f36a61879250b43e9dbb00d..94a9d5503eb2c7057ca9602b13dccf979ad06c09 100644 (file)
@@ -756,9 +756,15 @@ def add_tunable_cli_args(parser: argparse.ArgumentParser) -> None:
 
 @dataclasses.dataclass
 class RewardContext:
-    """Context for reward computation."""
+    """Context for reward computation.
 
-    pnl: float
+    Attributes
+    ----------
+    current_pnl : float
+        Unrealized PnL at the current tick (state s').
+    """
+
+    current_pnl: float
     trade_duration: int
     idle_duration: int
     max_unrealized_profit: float
@@ -1039,6 +1045,11 @@ def _compute_efficiency_coefficient(
         range_pnl = max_pnl - min_pnl
         if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
             efficiency_ratio = (pnl - min_pnl) / range_pnl
+            # For profits (pnl > 0): high ratio = good exit → higher coefficient → amplify gain
+            # For losses (pnl < 0): high ratio = good exit → LOWER coefficient → attenuate penalty
+            # The sign inversion for losses ensures reward = pnl * coef behaves correctly:
+            #   - Good loss exit: small |pnl| * low coef = small penalty
+            #   - Bad loss exit: large |pnl| * high coef = large penalty
             if pnl > 0.0:
                 efficiency_coefficient = 1.0 + efficiency_weight * (
                     efficiency_ratio - efficiency_center
@@ -1145,9 +1156,15 @@ def _compute_exit_reward(
         float: Exit reward (pnl * exit_factor)
     """
     exit_factor = _get_exit_factor(
-        base_factor, context.pnl, pnl_target, duration_ratio, context, params, risk_reward_ratio
+        base_factor,
+        context.current_pnl,
+        pnl_target,
+        duration_ratio,
+        context,
+        params,
+        risk_reward_ratio,
     )
-    return context.pnl * exit_factor
+    return context.current_pnl * exit_factor
 
 
 def calculate_reward(
@@ -1225,7 +1242,7 @@ def calculate_reward(
     breakdown.base_reward = float(base_reward)
 
     # === PBRS INTEGRATION ===
-    current_pnl = context.pnl if context.position != Positions.Neutral else 0.0
+    current_pnl = context.current_pnl if context.position != Positions.Neutral else 0.0
 
     next_position = _get_next_position(
         context.position, context.action, short_allowed=short_allowed
@@ -1582,7 +1599,7 @@ def simulate_samples(
         )
 
         context = RewardContext(
-            pnl=pnl,
+            current_pnl=pnl,
             trade_duration=trade_duration,
             idle_duration=idle_duration,
             max_unrealized_profit=max_unrealized_profit,
@@ -1606,7 +1623,7 @@ def simulate_samples(
         idle_ratio = context.idle_duration / max(1, max_idle_duration_candles)
         samples.append(
             {
-                "pnl": context.pnl,
+                "pnl": context.current_pnl,
                 "trade_duration": context.trade_duration,
                 "idle_duration": context.idle_duration,
                 "duration_ratio": _compute_duration_ratio(
index 496fdfa49836741747d610725da0df87fcbf7558..f5410d6fad9b2b568a26656418e3b06f32deb542 100644 (file)
@@ -18,7 +18,7 @@ from reward_space_analysis import (
     get_max_idle_duration_candles,
 )
 
-from ..constants import PARAMS, SCENARIOS, TOLERANCE
+from ..constants import EFFICIENCY, PARAMS, SCENARIOS, TOLERANCE
 from ..helpers import (
     RewardScenarioConfig,
     ThresholdTestConfig,
@@ -301,110 +301,189 @@ class TestRewardComponents(RewardSpaceTestBase):
             action=Actions.Long_exit,
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
 
         self.assertFinite(coefficient, name="efficiency_coefficient")
         self.assertAlmostEqualFloat(coefficient, 1.0, tolerance=TOLERANCE.GENERIC_EQ)
 
-    def test_efficiency_coefficient_optimal_profit_exit(self):
-        """Efficiency coefficient rewards exits near peak unrealized profit.
-
-        Validates that exiting close to maximum unrealized profit produces
-        coefficient > 1.0, incentivizing optimal exit timing for profitable trades.
+    def test_efficiency_coefficient_profits_monotonic_with_exact_bounds(self):
+        """Verify efficiency coefficient monotonicity for profitable trades.
 
         **Setup:**
-        - PnL: 0.029 (very close to max_unrealized_profit=0.03)
-        - Efficiency ratio: (0.029 - 0.0) / (0.03 - 0.0) ≈ 0.967 (high)
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position exiting near peak
+        - efficiency_weight: EFFICIENCY.WEIGHT_DEFAULT (1.0)
+        - efficiency_center: EFFICIENCY.CENTER_DEFAULT (0.5)
+        - PnL range: EFFICIENCY.PNL_RANGE_PROFIT (6 test points)
+        - Unrealized range: [0.0, EFFICIENCY.MAX_UNREALIZED_PROFIT]
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient > 1.0 (rewards optimal timing)
+        - Strict monotonicity (non-decreasing) as exit quality improves
+        - Exact coefficient values at bounds match formula: 1 + weight*(ratio - center)
+        - Poor exit: coefficient < 1.0, Optimal exit: coefficient > 1.0
         """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=0.029,  # Close to max
-            trade_duration=10,
-            max_unrealized_profit=0.03,
-            min_unrealized_profit=0.0,
-            position=Positions.Long,
-            action=Actions.Long_exit,
+        params = self.base_params(
+            efficiency_weight=EFFICIENCY.WEIGHT_DEFAULT,
+            efficiency_center=EFFICIENCY.CENTER_DEFAULT,
         )
+        max_unrealized_profit = EFFICIENCY.MAX_UNREALIZED_PROFIT
+        min_unrealized_profit = 0.0
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        pnl_values = list(EFFICIENCY.PNL_RANGE_PROFIT)
+        coefficients = []
 
-        self.assertFinite(coefficient, name="efficiency_coefficient")
-        self.assertGreater(
-            coefficient, 1.0, "Exit near max profit should reward with coefficient > 1.0"
+        for pnl in pnl_values:
+            ctx = self.make_ctx(
+                pnl=pnl,
+                trade_duration=EFFICIENCY.TRADE_DURATION_DEFAULT,
+                max_unrealized_profit=max_unrealized_profit,
+                min_unrealized_profit=min_unrealized_profit,
+                position=Positions.Long,
+                action=Actions.Long_exit,
+            )
+            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
+            coefficients.append(coefficient)
+
+        # Verify strict monotonicity
+        self.assertMonotonic(
+            coefficients,
+            non_decreasing=True,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="efficiency_coefficient_for_profits",
         )
 
-    def test_efficiency_coefficient_poor_profit_exit(self):
-        """Efficiency coefficient penalizes exits far from peak unrealized profit.
+        # Verify exact values at bounds using the formula
+        # coefficient = 1.0 + weight * (ratio - center)
+        # ratio = (pnl - min_pnl) / range_pnl
+        range_pnl = max_unrealized_profit - min_unrealized_profit
+
+        # Poor exit bound (first element)
+        pnl_poor = pnl_values[0]
+        expected_ratio_poor = (pnl_poor - min_unrealized_profit) / range_pnl
+        expected_coef_poor = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            expected_ratio_poor - EFFICIENCY.CENTER_DEFAULT
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[0],
+            expected_coef_poor,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Poor exit coefficient {coefficients[0]:.4f} != expected {expected_coef_poor:.4f}",
+        )
+        self.assertLess(coefficients[0], 1.0, "Poor profit exit should have coefficient < 1.0")
 
-        Validates that exiting far below maximum unrealized profit produces
-        coefficient < 1.0, penalizing poor exit timing that leaves profit on the table.
+        # Optimal exit bound (last element)
+        pnl_optimal = pnl_values[-1]
+        expected_ratio_optimal = (pnl_optimal - min_unrealized_profit) / range_pnl
+        expected_coef_optimal = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            expected_ratio_optimal - EFFICIENCY.CENTER_DEFAULT
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[-1],
+            expected_coef_optimal,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Optimal exit coefficient {coefficients[-1]:.4f} != expected {expected_coef_optimal:.4f}",
+        )
+        self.assertGreater(
+            coefficients[-1], 1.0, "Optimal profit exit should have coefficient > 1.0"
+        )
+
+    def test_efficiency_coefficient_losses_monotonic_with_exact_bounds(self):
+        """Verify efficiency coefficient behavior for losing trades.
 
         **Setup:**
-        - PnL: 0.005 (far from max_unrealized_profit=0.03)
-        - Efficiency ratio: (0.005 - 0.0) / (0.03 - 0.0) ≈ 0.167 (low)
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position exiting prematurely
+        - efficiency_weight: EFFICIENCY.WEIGHT_DEFAULT (1.0)
+        - efficiency_center: EFFICIENCY.CENTER_DEFAULT (0.5)
+        - PnL range: EFFICIENCY.PNL_RANGE_LOSS (7 test points, worst to best)
+        - Unrealized range: [EFFICIENCY.MIN_UNREALIZED_PROFIT, 0.0]
 
         **Assertions:**
-        - Coefficient is finite
-        - Coefficient < 1.0 (penalizes suboptimal timing)
+        - Coefficient DECREASES as exit quality improves (inverted formula)
+        - Exact values at bounds match: 1 + weight*(center - ratio)
+        - Reward (pnl * coef) is less negative for better exits
         """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=0.005,  # Far from max
-            trade_duration=10,
-            max_unrealized_profit=0.03,
-            min_unrealized_profit=0.0,
-            position=Positions.Long,
-            action=Actions.Long_exit,
-        )
-
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
-
-        self.assertFinite(coefficient, name="efficiency_coefficient")
-        self.assertLess(
-            coefficient, 1.0, "Exit far from max profit should penalize with coefficient < 1.0"
+        params = self.base_params(
+            efficiency_weight=EFFICIENCY.WEIGHT_DEFAULT,
+            efficiency_center=EFFICIENCY.CENTER_DEFAULT,
         )
+        max_unrealized_profit = 0.0
+        min_unrealized_profit = EFFICIENCY.MIN_UNREALIZED_PROFIT
 
-    def test_efficiency_coefficient_optimal_loss_exit(self):
-        """Efficiency coefficient rewards loss exits near minimum unrealized loss.
+        pnl_values = list(EFFICIENCY.PNL_RANGE_LOSS)
+        coefficients = []
+        rewards = []
 
-        Validates that exiting close to minimum unrealized loss produces
-        coefficient > 1.0, rewarding quick loss-cutting behavior for losing trades.
+        for pnl in pnl_values:
+            ctx = self.make_ctx(
+                pnl=pnl,
+                trade_duration=EFFICIENCY.TRADE_DURATION_DEFAULT,
+                max_unrealized_profit=max_unrealized_profit,
+                min_unrealized_profit=min_unrealized_profit,
+                position=Positions.Long,
+                action=Actions.Long_exit,
+            )
+            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
+            coefficients.append(coefficient)
+            # Simplified reward calculation (ignoring other factors for this test)
+            rewards.append(pnl * coefficient)
 
-        **Setup:**
-        - PnL: -0.005 (very close to min_unrealized_profit=-0.006)
-        - Efficiency ratio: (-0.005 - (-0.006)) / (0.0 - (-0.006)) ≈ 0.167 (low)
-        - For losses: coefficient = 1 + weight * (center - ratio) → rewards low ratio
-        - efficiency_weight: 1.0, efficiency_center: 0.5
-        - Trade context: Long position cutting losses quickly
+        # Verify coefficient DECREASES as exit quality improves (monotonically decreasing)
+        self.assertMonotonic(
+            coefficients,
+            non_increasing=True,  # Decreasing for losses!
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="efficiency_coefficient_for_losses",
+        )
 
-        **Assertions:**
-        - Coefficient is finite
-        - Coefficient > 1.0 (rewards optimal loss exit)
-        """
-        params = self.base_params(efficiency_weight=1.0, efficiency_center=0.5)
-        ctx = self.make_ctx(
-            pnl=-0.005,  # Close to min loss
-            trade_duration=10,
-            max_unrealized_profit=0.0,
-            min_unrealized_profit=-0.006,
-            position=Positions.Long,
-            action=Actions.Long_exit,
+        # Verify reward INCREASES (less negative) as exit quality improves
+        self.assertMonotonic(
+            rewards,
+            non_decreasing=True,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            name="exit_reward_for_losses",
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.pnl)
+        # Verify exact values at bounds using the INVERTED formula for losses
+        # coefficient = 1.0 + weight * (center - ratio)
+        range_pnl = max_unrealized_profit - min_unrealized_profit
 
-        self.assertFinite(coefficient, name="efficiency_coefficient")
+        # Worst exit bound (first element: largest loss)
+        pnl_worst = pnl_values[0]
+        ratio_worst = (pnl_worst - min_unrealized_profit) / range_pnl
+        expected_coef_worst = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            EFFICIENCY.CENTER_DEFAULT - ratio_worst
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[0],
+            expected_coef_worst,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Worst loss coefficient {coefficients[0]:.4f} != expected {expected_coef_worst:.4f}",
+        )
         self.assertGreater(
-            coefficient, 1.0, "Exit near min loss should reward with coefficient > 1.0"
+            coefficients[0],
+            1.0,
+            "Worst loss exit should have coefficient > 1.0 (amplifies penalty)",
+        )
+
+        # Optimal exit bound (last element: minimal loss)
+        pnl_optimal = pnl_values[-1]
+        ratio_optimal = (pnl_optimal - min_unrealized_profit) / range_pnl
+        expected_coef_optimal = 1.0 + EFFICIENCY.WEIGHT_DEFAULT * (
+            EFFICIENCY.CENTER_DEFAULT - ratio_optimal
+        )
+        self.assertAlmostEqualFloat(
+            coefficients[-1],
+            expected_coef_optimal,
+            tolerance=TOLERANCE.GENERIC_EQ,
+            msg=f"Minimal loss coefficient {coefficients[-1]:.4f} != expected {expected_coef_optimal:.4f}",
         )
+        self.assertLess(
+            coefficients[-1],
+            1.0,
+            "Minimal loss exit should have coefficient < 1.0 (attenuates penalty)",
+        )
+
+        # Verify the final reward semantics: better exit = less negative reward
+        self.assertLess(rewards[0], rewards[-1], "Worst exit should have more negative reward")
 
     def test_exit_reward_never_positive_for_loss_due_to_efficiency(self):
         """Exit reward should not become positive for a loss trade.
index 8db775b4a20fa3eb8a871f783aee1ba69659b1d9..ba564e754d419b79c6f23f840f775e25fa76c7ef 100644 (file)
@@ -100,6 +100,40 @@ class ExitFactorConfig:
     MIN_POWER_TAU: float = 1e-15
 
 
+@dataclass(frozen=True)
+class EfficiencyConfig:
+    """Efficiency coefficient testing configuration.
+
+    Configuration for exit timing efficiency coefficient validation, including
+    the formula parameters and standard test values.
+
+    The efficiency coefficient modifies exit rewards based on how well the agent
+    timed its exit relative to unrealized PnL extremes during the trade.
+
+    Formula:
+        For profits: coefficient = 1.0 + weight * (ratio - center)
+        For losses:  coefficient = 1.0 + weight * (center - ratio)  [inverted]
+        Where: ratio = (pnl - min_unrealized) / (max_unrealized - min_unrealized)
+
+    Attributes:
+        WEIGHT_DEFAULT: Default efficiency_weight parameter (1.0)
+        CENTER_DEFAULT: Default efficiency_center parameter (0.5)
+        MAX_UNREALIZED_PROFIT: Standard max unrealized profit for profit tests (0.03)
+        MIN_UNREALIZED_PROFIT: Standard min unrealized profit for loss tests (-0.03)
+        PNL_RANGE_PROFIT: Standard PnL range for profit tests: (min, max) tuple
+        PNL_RANGE_LOSS: Standard PnL range for loss tests: (min, max) tuple
+        TRADE_DURATION_DEFAULT: Default trade duration for efficiency tests (10)
+    """
+
+    WEIGHT_DEFAULT: float = 1.0
+    CENTER_DEFAULT: float = 0.5
+    MAX_UNREALIZED_PROFIT: float = 0.03
+    MIN_UNREALIZED_PROFIT: float = -0.03
+    PNL_RANGE_PROFIT: tuple[float, ...] = (0.005, 0.010, 0.015, 0.020, 0.025, 0.029)
+    PNL_RANGE_LOSS: tuple[float, ...] = (-0.029, -0.025, -0.020, -0.015, -0.010, -0.005, -0.001)
+    TRADE_DURATION_DEFAULT: int = 10
+
+
 @dataclass(frozen=True)
 class PBRSConfig:
     """Potential-Based Reward Shaping (PBRS) configuration.
@@ -398,6 +432,7 @@ class StatisticalTolerances:
 # Global singleton instances for easy import
 TOLERANCE: Final[ToleranceConfig] = ToleranceConfig()
 CONTINUITY: Final[ContinuityConfig] = ContinuityConfig()
+EFFICIENCY: Final[EfficiencyConfig] = EfficiencyConfig()
 EXIT_FACTOR: Final[ExitFactorConfig] = ExitFactorConfig()
 PBRS: Final[PBRSConfig] = PBRSConfig()
 STATISTICAL: Final[StatisticalConfig] = StatisticalConfig()
@@ -409,6 +444,7 @@ STAT_TOL: Final[StatisticalTolerances] = StatisticalTolerances()
 
 __all__ = [
     "CONTINUITY",
+    "EFFICIENCY",
     "EXIT_FACTOR",
     "PARAMS",
     "PBRS",
@@ -418,6 +454,7 @@ __all__ = [
     "STAT_TOL",
     "TOLERANCE",
     "ContinuityConfig",
+    "EfficiencyConfig",
     "ExitFactorConfig",
     "PBRSConfig",
     "StatisticalConfig",
index 76b6cc191e90e1f861ff65933d9840d50ee168ee..25c1b499f7c6ef3436a7515f38ffdf312c8eb4ec 100644 (file)
@@ -666,13 +666,13 @@ def assert_exit_mode_mathematical_validation(
     )
     pnl_target = profit_aim * risk_reward_ratio
     pnl_target_coefficient = _compute_pnl_target_coefficient(
-        params, context.pnl, pnl_target, risk_reward_ratio
+        params, context.current_pnl, pnl_target, risk_reward_ratio
     )
-    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.pnl)
+    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.current_pnl)
 
     observed_exit_factor = _get_exit_factor(
         base_factor,
-        context.pnl,
+        context.current_pnl,
         pnl_target,
         duration_ratio,
         context,
index 85dc8e2f4e3fbdcb1403d4142e4eaadf69e3e1fa..cbbc3e516f76cdb6b79bea2032936ca0acf6e583 100644 (file)
@@ -748,7 +748,7 @@ class TestPBRS(RewardSpaceTestBase):
         )
 
         expected_next_potential = _compute_hold_potential(
-            pnl=ctx.pnl,
+            pnl=ctx.current_pnl,
             pnl_target=PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
             duration_ratio=(trade_duration / max_trade_duration_candles),
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
@@ -958,7 +958,7 @@ class TestPBRS(RewardSpaceTestBase):
 
         current_duration_ratio = ctx.trade_duration / params["max_trade_duration_candles"]
         prev_potential = _compute_hold_potential(
-            ctx.pnl,
+            ctx.current_pnl,
             pnl_target,
             current_duration_ratio,
             PARAMS.RISK_REWARD_RATIO,
index 7487bfbe9d80e9a48b123abdfacb99ba1c2b2ba9..e0c1db2564f9deae117bd996ab169e52e81f9b9b 100644 (file)
@@ -171,8 +171,8 @@ class TestStatistics(RewardSpaceTestBase):
         if len(df) > 30:
             idle_data = df[df["idle_duration"] > 0]
             if len(idle_data) > 10:
-                idle_dur = np.asarray(idle_data["idle_duration"], dtype=float)
-                idle_rew = np.asarray(idle_data["reward_idle"], dtype=float)
+                idle_dur = idle_data["idle_duration"].to_numpy(dtype=float)
+                idle_rew = idle_data["reward_idle"].to_numpy(dtype=float)
                 self.assertTrue(
                     len(idle_dur) == len(idle_rew),
                     "Idle duration and reward arrays should have same length",
index 86f3fd02d4bd2796660ab6245ae85c3629fe72a0..4bb26984fed3d83501320e009281fa381fcf3825 100644 (file)
@@ -45,7 +45,7 @@ def make_ctx(
 ) -> RewardContext:
     """Create a RewardContext with neutral defaults."""
     return RewardContext(
-        pnl=pnl,
+        current_pnl=pnl,
         trade_duration=trade_duration,
         idle_duration=idle_duration,
         max_unrealized_profit=max_unrealized_profit,
index 897474128e4fdf5828eaa556af6036ea7e1460ae..ebc7afd8ffec94437ebf05d418520e55e7118429 100644 (file)
@@ -1372,13 +1372,13 @@ class ReforceXY(BaseReinforcementLearningModel):
             )
             hyperopt_failed = True
         time_spent = time.time() - start_time
-        n_complete = len([t for t in study.trials if t.state == TrialState.COMPLETE])
+        n_completed = len([t for t in study.trials if t.state == TrialState.COMPLETE])
         n_pruned = len([t for t in study.trials if t.state == TrialState.PRUNED])
         n_failed = len([t for t in study.trials if t.state == TrialState.FAIL])
         logger.info(
-            "Hyperopt %s: %s complete, %s pruned, %s failed trials",
+            "Hyperopt %s: %s completed, %s pruned, %s failed trials",
             study_name,
-            n_complete,
+            n_completed,
             n_pruned,
             n_failed,
         )
@@ -1975,31 +1975,46 @@ class MyRLEnv(Base5ActionRLEnv):
         self,
         action: int,
         trade_duration: float,
-        pnl: float,
+        current_pnl: float,
     ) -> Tuple[Positions, int, float]:
-        """Compute next transition state tuple."""
+        """Compute next transition state tuple (next_position, next_duration, next_pnl).
+
+        Parameters
+        ----------
+        action : int
+            Action taken by the agent.
+        trade_duration : float
+            Trade duration at current tick.
+        current_pnl : float
+            Unrealized PnL at current tick.
+
+        Returns
+        -------
+        tuple[Positions, int, float]
+            (next_position, next_trade_duration, next_pnl) for the transition s -> s'.
+        """
         next_position = self._get_next_position(action)
 
-        # Entry
+        # Entry: Neutral -> Long/Short
         if self._position == Positions.Neutral and next_position in (
             Positions.Long,
             Positions.Short,
         ):
             return next_position, 0, self._get_entry_unrealized_profit(next_position)
 
-        # Exit
+        # Exit: Long/Short -> Neutral
         if (
             self._position in (Positions.Long, Positions.Short)
             and next_position == Positions.Neutral
         ):
             return next_position, 0, 0.0
 
-        # Hold
+        # Hold: Long/Short -> Long/Short
         if self._position in (Positions.Long, Positions.Short) and next_position in (
             Positions.Long,
             Positions.Short,
         ):
-            return next_position, int(trade_duration), pnl
+            return next_position, int(trade_duration), current_pnl
 
         # Neutral self-loop
         return next_position, 0, 0.0
@@ -2258,7 +2273,7 @@ class MyRLEnv(Base5ActionRLEnv):
         action: int,
         trade_duration: float,
         max_trade_duration: float,
-        pnl: float,
+        current_pnl: float,
         pnl_target: float,
         hold_potential_scale: float,
         entry_additive_scale: float,
@@ -2395,11 +2410,13 @@ class MyRLEnv(Base5ActionRLEnv):
         action : int
             Action taken: determines transition type (entry/hold/exit)
         trade_duration : float
-            Current trade duration in candles (for current state s)
+            Trade duration at current tick.
+            This is the duration for state s'.
         max_trade_duration : float
             Maximum allowed trade duration (for normalization)
-        pnl : float
-            Current position PnL (for current state s)
+        current_pnl : float
+            Unrealized PnL at current tick.
+            This is the PnL for state s'.
         pnl_target : float
             Target PnL for ratio normalization: r_pnl = pnl / pnl_target
         hold_potential_scale : float
@@ -2449,7 +2466,7 @@ class MyRLEnv(Base5ActionRLEnv):
             return 0.0, 0.0, 0.0
 
         next_position, next_trade_duration, next_pnl = self._get_next_transition_state(
-            action=action, trade_duration=trade_duration, pnl=pnl
+            action=action, trade_duration=trade_duration, current_pnl=current_pnl
         )
         if max_trade_duration <= 0:
             next_duration_ratio = 0.0
@@ -2520,7 +2537,7 @@ class MyRLEnv(Base5ActionRLEnv):
             if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
                 duration_ratio = trade_duration / max(1, max_trade_duration)
                 exit_additive = self._compute_exit_additive(
-                    pnl, pnl_target, duration_ratio, exit_additive_scale
+                    current_pnl, pnl_target, duration_ratio, exit_additive_scale
                 )
                 self._total_exit_additive += float(exit_additive)
 
@@ -2988,7 +3005,7 @@ class MyRLEnv(Base5ActionRLEnv):
             action=action,
             trade_duration=trade_duration,
             max_trade_duration=max_trade_duration,
-            pnl=pnl,
+            current_pnl=pnl,
             pnl_target=self._pnl_target,
             hold_potential_scale=hold_potential_scale,
             entry_additive_scale=entry_additive_scale,
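
Editor's note: the renamed current_pnl ultimately feeds the PBRS shaping term through the s -> s' transition returned by _get_next_transition_state. As a reminder of the standard potential-based shaping identity (Ng, Harada and Russell, 1999), a minimal generic sketch follows; phi here is a hypothetical placeholder, not ReforceXY's _compute_hold_potential.

def phi(pnl: float, pnl_target: float, duration_ratio: float, scale: float = 1.0) -> float:
    # Hypothetical potential: scaled PnL progress toward target, discounted by duration.
    return scale * (pnl / pnl_target) * (1.0 - duration_ratio) if pnl_target else 0.0

def shaping_term(gamma: float, phi_s: float, phi_s_next: float) -> float:
    # Potential-based shaping: F(s, s') = gamma * phi(s') - phi(s).
    # Adding F to the base reward leaves the optimal policy unchanged.
    return gamma * phi_s_next - phi_s

# Holding a long position for one more candle (hypothetical numbers):
phi_s = phi(pnl=0.01, pnl_target=0.06, duration_ratio=0.20)
phi_s_next = phi(pnl=0.012, pnl_target=0.06, duration_ratio=0.25)
print(shaping_term(gamma=0.99, phi_s=phi_s, phi_s_next=phi_s_next))  # ~0.015
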