fix(ReforceXY): fix off-by-one in PBRS next state duration computation

author Jérôme Benoit <jerome.benoit@piment-noir.org>

Mon, 22 Dec 2025 12:37:22 +0000 (13:37 +0100)

committer Jérôme Benoit <jerome.benoit@piment-noir.org>

Mon, 22 Dec 2025 12:37:22 +0000 (13:37 +0100)
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Mon, 22 Dec 2025 12:37:22 +0000 (13:37 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Mon, 22 Dec 2025 12:37:22 +0000 (13:37 +0100)
diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md

index 3b00794924fbd305738d71e12e4730cc6e111b64..06230e26e4da782f8c94e9cb6c434a29e12943b1 100644 (file)
--- a/ReforceXY/reward_space_analysis/README.md
+++ b/ReforceXY/reward_space_analysis/README.md
@@ -320,6 +320,8 @@ where `kernel_function` depends on `exit_attenuation_mode`. See [Exit Attenuatio
  | `exit_potential_mode`    | canonical | Potential release mode            |
  | `exit_potential_decay`   | 0.5       | Decay for progressive_release     |
  | `hold_potential_enabled` | true      | Enable hold potential Φ           |
+| `entry_fee_rate`         | 0.0       | Entry fee rate                    |
+| `exit_fee_rate`          | 0.0       | Exit fee rate                     |
  
  PBRS invariance holds when: `exit_potential_mode=canonical`.
  
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py

index e4e8d035ef929ef08c1016e8c57b50271c76a308..ee9720fdfb6adc80e8540443eb1716362bcd1cef 100644 (file)
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -153,6 +153,8 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
      "entry_additive_gain": 1.0,
      "entry_additive_transform_pnl": "tanh",
      "entry_additive_transform_duration": "tanh",
+    "entry_fee_rate": 0.0,
+    "exit_fee_rate": 0.0,
      # Exit additive (non-PBRS additive term)
      "exit_additive_enabled": False,
      "exit_additive_scale": 1.0,
@@ -196,6 +198,8 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
      "entry_additive_gain": "Entry additive gain",
      "entry_additive_transform_pnl": "Entry PnL transform",
      "entry_additive_transform_duration": "Entry duration transform",
+    "entry_fee_rate": "Entry fee rate",
+    "exit_fee_rate": "Exit fee rate",
      "exit_additive_enabled": "Enable exit additive",
      "exit_additive_scale": "Exit additive scale",
      "exit_additive_gain": "Exit additive gain",
@@ -233,6 +237,8 @@ _PARAMETER_BOUNDS: Dict[str, Dict[str, float]] = {
      "hold_potential_gain": {"min": 0.0},
      "entry_additive_scale": {"min": 0.0},
      "entry_additive_gain": {"min": 0.0},
+    "entry_fee_rate": {"min": 0.0, "max": 0.1},
+    "exit_fee_rate": {"min": 0.0, "max": 0.1},
      "exit_additive_scale": {"min": 0.0},
      "exit_additive_gain": {"min": 0.0},
  }
@@ -1174,11 +1180,16 @@ def calculate_reward(
      is_neutral = context.position == Positions.Neutral and context.action == Actions.Neutral
  
      if is_entry:
-        next_pnl = current_pnl
          next_duration_ratio = 0.0
+        if context.action == Actions.Long_enter:
+            next_pnl = _compute_entry_unrealized_pnl_estimate(Positions.Long, params)
+        elif context.action == Actions.Short_enter:
+            next_pnl = _compute_entry_unrealized_pnl_estimate(Positions.Short, params)
+        else:
+            next_pnl = current_pnl
      elif is_hold:
          next_duration_ratio = _compute_duration_ratio(
-            context.trade_duration + 1, max_trade_duration_candles
+            context.trade_duration, max_trade_duration_candles
          )
          # Optionally simulate unrealized PnL during holds to feed Φ(s)
          if _get_bool_param(params, "unrealized_pnl", False):
@@ -1468,10 +1479,16 @@ def simulate_samples(
                  position = Positions.Long
                  trade_duration = 0
                  idle_duration = 0
+                pnl = _compute_entry_unrealized_pnl_estimate(Positions.Long, params)
+                max_unrealized_profit = pnl
+                min_unrealized_profit = pnl
              elif action == Actions.Short_enter and short_allowed:
                  position = Positions.Short
                  trade_duration = 0
                  idle_duration = 0
+                pnl = _compute_entry_unrealized_pnl_estimate(Positions.Short, params)
+                max_unrealized_profit = pnl
+                min_unrealized_profit = pnl
          else:
              idle_duration = 0
              if action == Actions.Neutral:
@@ -2773,6 +2790,66 @@ def _get_potential_gamma(params: RewardParams) -> float:
  # === PBRS IMPLEMENTATION ===
  
  
+def _compute_entry_unrealized_pnl_estimate(next_position: Positions, params: RewardParams) -> float:
+    """Estimate immediate unrealized PnL after entry fees.
+
+    For Long entry:
+        current_price = open * (1 - exit_fee_rate)
+        last_trade_price = open * (1 + entry_fee_rate)
+        pnl = (current_price - last_trade_price) / last_trade_price
+
+    For Short entry:
+        current_price = open * (1 + entry_fee_rate)
+        last_trade_price = open * (1 - exit_fee_rate)
+        pnl = (last_trade_price - current_price) / last_trade_price
+    """
+
+    entry_fee_rate = _get_float_param(
+        params,
+        "entry_fee_rate",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("entry_fee_rate", 0.0),
+    )
+    exit_fee_rate = _get_float_param(
+        params,
+        "exit_fee_rate",
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_fee_rate", 0.0),
+    )
+
+    if not np.isfinite(entry_fee_rate):
+        entry_fee_rate = 0.0
+    if not np.isfinite(exit_fee_rate):
+        exit_fee_rate = 0.0
+
+    entry_fee_bounds = _PARAMETER_BOUNDS.get("entry_fee_rate", {"min": 0.0, "max": 1.0})
+    exit_fee_bounds = _PARAMETER_BOUNDS.get("exit_fee_rate", {"min": 0.0, "max": 1.0})
+
+    entry_fee_min = float(entry_fee_bounds.get("min", 0.0))
+    entry_fee_max = float(entry_fee_bounds.get("max", 1.0))
+    exit_fee_min = float(exit_fee_bounds.get("min", 0.0))
+    exit_fee_max = float(exit_fee_bounds.get("max", 1.0))
+
+    entry_fee_rate = float(np.clip(entry_fee_rate, entry_fee_min, entry_fee_max))
+    exit_fee_rate = float(np.clip(exit_fee_rate, exit_fee_min, exit_fee_max))
+
+    current_open = 1.0
+    next_pnl = 0.0
+
+    if next_position == Positions.Long:
+        current_price = current_open * (1.0 - exit_fee_rate)
+        last_trade_price = current_open * (1.0 + entry_fee_rate)
+        if last_trade_price != 0.0 and np.isfinite(last_trade_price):
+            next_pnl = (current_price - last_trade_price) / last_trade_price
+    elif next_position == Positions.Short:
+        current_price = current_open * (1.0 + entry_fee_rate)
+        last_trade_price = current_open * (1.0 - exit_fee_rate)
+        if last_trade_price != 0.0 and np.isfinite(last_trade_price):
+            next_pnl = (last_trade_price - current_price) / last_trade_price
+
+    if not np.isfinite(next_pnl):
+        return 0.0
+    return float(next_pnl)
+
+
  def _compute_hold_potential(
      pnl: float,
      pnl_target: float,
diff --git a/ReforceXY/reward_space_analysis/tests/.docstring_template.md b/ReforceXY/reward_space_analysis/tests/.docstring_template.md

index 0551814f81b0675db09e45fb4599419d3b058b93..2c8e7054214b927bba1cf618417275ecaf85ff8e 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/.docstring_template.md
+++ b/ReforceXY/reward_space_analysis/tests/.docstring_template.md
@@ -199,13 +199,12 @@ def test_property_holds_for_all_inputs(self):
  ### Regression Test
  
  ```python
-def test_bug_fix_issue_123_no_regression(self):
+def test_issue_123_regression_is_prevented(self):
      """Regression test for Issue #123: [brief description].
  
-    Ensures fix for [bug description] remains effective.
-    Bug manifested when [conditions]; this test reproduces those conditions.
+    Validates behavior for [conditions] and asserts the expected outcome.
  
-    **Fixed in:** PR #456 (commit abc1234)
+    **Tracking:** Issue #123
      """
  ```
  
diff --git a/ReforceXY/reward_space_analysis/tests/README.md b/ReforceXY/reward_space_analysis/tests/README.md

index c3d15970da7737b6748deb7c1c1d22c6dc381d42..a3605a59d43df2097020defbd022735c08ff0bdd 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/README.md
+++ b/ReforceXY/reward_space_analysis/tests/README.md
@@ -194,7 +194,7 @@ Columns:
  | statistics-constant-dist-widened-ci-113a     | statistics  | Non-strict: widened CI with warning                                                 | statistics/test_statistics.py:533         | Test docstring labels "Invariant 113 (non-strict)"                                                                          |
  | statistics-constant-dist-strict-omit-113b    | statistics  | Strict: omit metrics (no widened CI)                                                | statistics/test_statistics.py:565         | Test docstring labels "Invariant 113 (strict)"                                                                              |
  | statistics-fallback-diagnostics-115          | statistics  | Fallback diagnostics constant distribution (qq_r2=1.0 etc.)                         | statistics/test_statistics.py:190         | Docstring line                                                                                                              |
-| robustness-exit-pnl-only-117                 | robustness  | Only exit actions have non-zero PnL                                                 | robustness/test_robustness.py:126         | Newly assigned ID (previously unnumbered)                                                                                   |
+| robustness-exit-pnl-only-117                 | robustness  | Only exit actions have non-zero PnL                                                 | robustness/test_robustness.py:126         | Newly assigned ID                                                                                                           |
  | pbrs-absence-shift-placeholder-118           | pbrs        | Placeholder shift line present (absence displayed)                                  | pbrs/test_pbrs.py:979                     | Ensures placeholder appears when shaping shift absent                                                                       |
  | components-pbrs-breakdown-fields-119         | components  | PBRS breakdown fields finite and mathematically aligned                             | components/test_reward_components.py:454  | Tests base_reward, pbrs_delta, invariance_correction fields and their alignment                                             |
  | integration-pbrs-metrics-section-120         | integration | PBRS Metrics section present in report with tracing metrics                         | integration/test_report_formatting.py:156 | Verifies PBRS Metrics (Tracing) subsection rendering in statistical_analysis.md                                             |
diff --git a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py

index ef7b3562958a0368f74aec18dadcb285cd7cbdd9..fb003250781bb248b5a83839c242ced1b4bcda31 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
+++ b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
@@ -13,12 +13,16 @@ from reward_space_analysis import (
      DEFAULT_IDLE_DURATION_MULTIPLIER,
      DEFAULT_MODEL_REWARD_PARAMETERS,
      PBRS_INVARIANCE_TOL,
+    Actions,
+    Positions,
      _compute_entry_additive,
+    _compute_entry_unrealized_pnl_estimate,
      _compute_exit_additive,
      _compute_exit_potential,
      _compute_hold_potential,
      _get_float_param,
      apply_potential_shaping,
+    calculate_reward,
      get_max_idle_duration_candles,
      simulate_samples,
      validate_reward_parameters,
@@ -485,6 +489,153 @@ class TestPBRS(RewardSpaceTestBase):
              "Unexpected total difference under gamma NaN fallback",
          )
  
+    def test_calculate_reward_entry_next_pnl_fee_aware(self):
+        """calculate_reward() entry PBRS uses fee-aware next_pnl estimate."""
+        params = self.base_params(
+            exit_potential_mode="non_canonical",
+            hold_potential_enabled=True,
+            entry_additive_enabled=False,
+            exit_additive_enabled=False,
+            potential_gamma=0.9,
+            entry_fee_rate=0.001,
+            exit_fee_rate=0.001,
+        )
+
+        for action in (Actions.Long_enter, Actions.Short_enter):
+            ctx = self.make_ctx(
+                position=Positions.Neutral, action=action, pnl=0.0, trade_duration=0
+            )
+            breakdown = calculate_reward(
+                ctx,
+                params,
+                base_factor=PARAMS.BASE_FACTOR,
+                profit_aim=PARAMS.PROFIT_AIM,
+                risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                short_allowed=True,
+                action_masking=True,
+                prev_potential=0.0,
+            )
+            self.assertTrue(np.isfinite(breakdown.next_potential))
+            # With any nonzero fees, immediate unrealized pnl should be negative.
+            self.assertLess(
+                breakdown.next_potential,
+                0.0,
+                f"Expected negative next_potential on entry for action={action}",
+            )
+
+    def test_fee_rates_are_clamped_to_parameter_bounds(self):
+        """Fee clamping uses _PARAMETER_BOUNDS (max 0.1)."""
+        cases = [
+            ("entry_fee_rate", self.base_params(entry_fee_rate=999.0, exit_fee_rate=0.0)),
+            ("exit_fee_rate", self.base_params(entry_fee_rate=0.0, exit_fee_rate=999.0)),
+        ]
+
+        for key, params in cases:
+            pnl_clamped = _compute_entry_unrealized_pnl_estimate(Positions.Long, params)
+            pnl_expected = _compute_entry_unrealized_pnl_estimate(
+                Positions.Long,
+                {**params, key: 0.1},
+            )
+            self.assertAlmostEqualFloat(
+                pnl_clamped,
+                pnl_expected,
+                tolerance=TOLERANCE.IDENTITY_STRICT,
+                msg=f"Expected {key} values above max to clamp to 0.1",
+            )
+
+    def test_simulate_samples_initializes_pnl_on_entry(self):
+        """simulate_samples() sets in-position pnl to entry fee estimate."""
+        params = self.base_params(
+            exit_potential_mode="non_canonical",
+            hold_potential_enabled=True,
+            entry_additive_enabled=False,
+            exit_additive_enabled=False,
+            entry_fee_rate=0.001,
+            exit_fee_rate=0.001,
+        )
+
+        df = simulate_samples(
+            num_samples=50,
+            seed=1,
+            params=params,
+            base_factor=PARAMS.BASE_FACTOR,
+            profit_aim=PARAMS.PROFIT_AIM,
+            risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+            max_duration_ratio=1.0,
+            trading_mode="futures",
+            pnl_base_std=0.0,
+            pnl_duration_vol_scale=0.0,
+        )
+
+        enter_rows = df[df["action"] == float(Actions.Long_enter.value)]
+        self.assertGreater(len(enter_rows), 0, "Expected at least one Long_enter in sample")
+
+        enter_pos = df.reset_index(drop=True)
+        enter_mask = enter_pos["action"].to_numpy() == float(Actions.Long_enter.value)
+        enter_positions = np.flatnonzero(enter_mask)
+        first_enter_pos = int(enter_positions[0])
+        next_pos = first_enter_pos + 1
+
+        self.assertLess(next_pos, len(enter_pos), "Sample must include post-entry step")
+        self.assertEqual(
+            float(enter_pos.iloc[next_pos]["position"]),
+            float(Positions.Long.value),
+            "Expected Long position immediately after Long_enter",
+        )
+
+        expected_pnl = _compute_entry_unrealized_pnl_estimate(Positions.Long, params)
+        post_entry_pnl = float(enter_pos.iloc[next_pos]["pnl"])
+        self.assertAlmostEqualFloat(
+            post_entry_pnl,
+            expected_pnl,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            msg="Expected pnl after entry to match entry fee estimate",
+        )
+
+    def test_calculate_reward_hold_uses_current_duration_ratio(self):
+        """calculate_reward() hold next_duration_ratio uses trade_duration."""
+        params = self.base_params(
+            exit_potential_mode="non_canonical",
+            hold_potential_enabled=True,
+            entry_additive_enabled=False,
+            exit_additive_enabled=False,
+            potential_gamma=0.9,
+        )
+
+        trade_duration = 5
+        max_trade_duration_candles = 10
+
+        ctx = self.make_ctx(
+            position=Positions.Long,
+            action=Actions.Neutral,
+            pnl=0.01,
+            trade_duration=trade_duration,
+        )
+
+        breakdown = calculate_reward(
+            ctx,
+            {**params, "max_trade_duration_candles": max_trade_duration_candles},
+            base_factor=PARAMS.BASE_FACTOR,
+            profit_aim=PARAMS.PROFIT_AIM,
+            risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+            short_allowed=True,
+            action_masking=True,
+            prev_potential=0.0,
+        )
+
+        expected_next_potential = _compute_hold_potential(
+            pnl=ctx.pnl,
+            pnl_target=PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
+            duration_ratio=(trade_duration / max_trade_duration_candles),
+            params=params,
+        )
+        self.assertAlmostEqualFloat(
+            breakdown.next_potential,
+            expected_next_potential,
+            tolerance=TOLERANCE.IDENTITY_RELAXED,
+            msg="Hold next_potential mismatch (duration ratio mismatch)",
+        )
+
      # ---------------- Validation parameter batch & relaxed aggregation ---------------- #
  
      def test_validate_reward_parameters_batch_and_relaxed_aggregation(self):
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py

index 3086750d655ae0217c43064d23a5c86a8fc07884..c69f63d5963915c7073edf7e0eac085f6fa26ce0 100644 (file)
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1793,6 +1793,29 @@ class MyRLEnv(Base5ActionRLEnv):
              return Positions.Neutral
          return self._position
  
+    def _get_entry_unrealized_profit(self, next_position: Positions) -> float:
+        current_open = self.prices.iloc[self._current_tick].open
+        if not isinstance(current_open, (int, float, np.floating)) or not np.isfinite(
+            current_open
+        ):
+            return 0.0
+
+        next_pnl = 0.0
+        if next_position == Positions.Long:
+            current_price = self.add_exit_fee(current_open)
+            last_trade_price = self.add_entry_fee(current_open)
+            if not np.isclose(last_trade_price, 0.0) and np.isfinite(last_trade_price):
+                next_pnl = (current_price - last_trade_price) / last_trade_price
+        elif next_position == Positions.Short:
+            current_price = self.add_entry_fee(current_open)
+            last_trade_price = self.add_exit_fee(current_open)
+            if not np.isclose(last_trade_price, 0.0) and np.isfinite(last_trade_price):
+                next_pnl = (last_trade_price - current_price) / last_trade_price
+
+        if not np.isfinite(next_pnl):
+            return 0.0
+        return float(next_pnl)
+
      def _get_next_transition_state(
          self,
          action: int,
@@ -1801,24 +1824,28 @@ class MyRLEnv(Base5ActionRLEnv):
      ) -> Tuple[Positions, int, float]:
          """Compute next transition state tuple."""
          next_position = self._get_next_position(action)
+
          # Entry
          if self._position == Positions.Neutral and next_position in (
              Positions.Long,
              Positions.Short,
          ):
-            return next_position, 0, pnl
+            return next_position, 0, self._get_entry_unrealized_profit(next_position)
+
          # Exit
          if (
              self._position in (Positions.Long, Positions.Short)
              and next_position == Positions.Neutral
          ):
              return next_position, 0, 0.0
+
          # Hold
          if self._position in (Positions.Long, Positions.Short) and next_position in (
              Positions.Long,
              Positions.Short,
          ):
-            return next_position, int(trade_duration) + 1, pnl
+            return next_position, int(trade_duration), pnl
+
          # Neutral self-loop
          return next_position, 0, 0.0
author	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Mon, 22 Dec 2025 12:37:22 +0000 (13:37 +0100)
committer	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Mon, 22 Dec 2025 12:37:22 +0000 (13:37 +0100)
ReforceXY/reward_space_analysis/README.md		patch \| blob \| blame \| history
ReforceXY/reward_space_analysis/reward_space_analysis.py		patch \| blob \| blame \| history
ReforceXY/reward_space_analysis/tests/.docstring_template.md		patch \| blob \| blame \| history
ReforceXY/reward_space_analysis/tests/README.md		patch \| blob \| blame \| history
ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py		patch \| blob \| blame \| history
ReforceXY/user_data/freqaimodels/ReforceXY.py		patch \| blob \| blame \| history