next_potential: float = 0.0
# PBRS helpers
base_reward: float = 0.0
- pbrs_delta: float = 0.0 # Δ(s,s') = γ·Φ(s') − Φ(s)
+ pbrs_delta: float = 0.0 # Δ(s,a,s') = γ·Φ(s') − Φ(s)
invariance_correction: float = 0.0
)
base_reward = breakdown.invalid_penalty
- factor = _get_float_param(params, "base_factor", base_factor)
+ base_factor = _get_float_param(params, "base_factor", base_factor)
if "profit_aim" in params:
profit_aim = _get_float_param(params, "profit_aim", float(profit_aim))
pnl_target = float(profit_aim * risk_reward_ratio)
- idle_factor = factor * (profit_aim / 4.0)
+ idle_factor = base_factor * (profit_aim / risk_reward_ratio) / 4.0
hold_factor = idle_factor
max_trade_duration_candles = _get_int_param(
) or (context.action == Actions.Short_exit and context.position == Positions.Short)
if is_exit_action:
base_reward = _compute_exit_reward(
- factor,
+ base_factor,
pnl_target,
current_duration_ratio,
context,
breakdown.total = base_reward
return breakdown
- total_reward, reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive = (
- apply_potential_shaping(
- base_reward=base_reward,
+ reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive = (
+ compute_pbrs_components(
current_pnl=current_pnl,
pnl_target=pnl_target,
current_duration_ratio=current_duration_ratio,
breakdown.exit_additive = exit_additive
breakdown.pbrs_delta = pbrs_delta
breakdown.invariance_correction = reward_shaping - pbrs_delta
- breakdown.total = total_reward
+ breakdown.total = base_reward + reward_shaping + entry_additive + exit_additive
return breakdown
prev_potential = float(prev_potential) if np.isfinite(prev_potential) else 0.0
return float(next_potential)
-def apply_potential_shaping(
- base_reward: float,
+def compute_pbrs_components(
current_pnl: float,
pnl_target: float,
current_duration_ratio: float,
is_exit: bool = False,
is_entry: bool = False,
prev_potential: float,
-) -> tuple[float, float, float, float, float, float]:
- """Compute shaped reward using PBRS.
+) -> tuple[float, float, float, float, float]:
+ """Compute potential-based reward shaping (PBRS) components.
+
+ This function computes the PBRS shaping terms without combining them with the base reward,
+ allowing the caller to construct the total reward as R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + additives.
+
+ Canonical PBRS Formula
+ ----------------------
+ R'(s,a,s') = R(s,a,s') + γ·Φ(s') - Φ(s)
+
+ where:
+ Δ(s,a,s') = γ·Φ(s') - Φ(s) (PBRS shaping term)
+
+ Notation
+ --------
+ **States & Actions:**
+ s : current state
+ s' : next state
+ a : action
+
+ **Reward Components:**
+ R(s,a,s') : base reward
+ R'(s,a,s') : shaped reward
+ Δ(s,a,s') : PBRS shaping term = γ·Φ(s') - Φ(s)
+
+ **Potential Function:**
+ Φ(s) : potential at state s
+ γ : discount factor for shaping (gamma)
+
+ **State Variables:**
+ r_pnl : pnl / pnl_target (PnL ratio)
+        r_dur : duration / max_duration (duration ratio, clamped to [0,1])
+ g : gain parameter
+ T_x : transform function (tanh, softsign, etc.)
+
+ **Potential Formula:**
+ Φ(s) = scale · 0.5 · [T_pnl(g·r_pnl) + sgn(r_pnl)·T_dur(g·r_dur)]
+
+ PBRS Theory & Compliance
+ ------------------------
+ - Ng et al. 1999: potential-based shaping preserves optimal policy
+ - Wiewiora et al. 2003: terminal states must have Φ(terminal) = 0
+ - Invariance holds ONLY in canonical mode with additives disabled
+ - Theorem: Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0 over episodes
+
+ Architecture & Transitions
+ --------------------------
+ **Three mutually exclusive transition types:**
+
+ 1. **Entry** (Neutral → Long/Short):
+ - Φ(s) = 0 (neutral state has no potential)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - 0 = γ·Φ(s')
+ - Optional entry additive (breaks invariance)
+
+ 2. **Hold** (Long/Short → Long/Short):
+ - Φ(s) = hold_potential(s)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - Φ(s)
+ - Φ(s') reflects updated PnL and duration
+
+ 3. **Exit** (Long/Short → Neutral):
+ - Φ(s) = hold_potential(s)
+ - Φ(s') depends on exit_potential_mode:
+ * **canonical**: Φ(s') = 0 → Δ = -Φ(s)
+ * **heuristic**: Φ(s') = f(Φ(s)) → Δ = γ·Φ(s') - Φ(s)
+ - Optional exit additive (breaks invariance)
+
+ Exit Potential Modes
+ --------------------
+ **canonical** (PBRS-compliant):
+ Φ(s') = 0
+ Δ = γ·0 - Φ(s) = -Φ(s)
+ Additives disabled automatically
+
+ **non_canonical**:
+ Φ(s') = 0
+ Δ = -Φ(s)
+ Additives allowed (breaks invariance)
+
+ **progressive_release** (heuristic):
+ Φ(s') = Φ(s)·(1 - d) where d = decay_factor
+ Δ = γ·Φ(s)·(1-d) - Φ(s)
+
+ **spike_cancel** (heuristic):
+ Φ(s') = Φ(s)/γ
+ Δ = γ·(Φ(s)/γ) - Φ(s) = 0
+
+ **retain_previous** (heuristic):
+ Φ(s') = Φ(s)
+ Δ = γ·Φ(s) - Φ(s) = (γ-1)·Φ(s)
+
+ Additive Terms (Non-PBRS)
+ --------------------------
+ Entry and exit additives are **optional bonuses** that break PBRS invariance:
+ - Entry additive: applied on Neutral→Long/Short transitions
+ - Exit additive: applied on Long/Short→Neutral transitions
+ - These do NOT persist in Φ(s) storage
+
+ Invariance & Validation
+ -----------------------
+ **Theoretical Guarantee:**
+ Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0
+ (Φ(start) = Φ(end) = 0)
+
+ **Deviations from Theory:**
+ - Heuristic exit modes violate invariance
+ - Entry/exit additives break policy invariance
+ - Non-canonical modes introduce path dependence
+
+ **Robustness:**
+ - All transforms bounded: |T_x| ≤ 1
+ - Validation: |Φ(s)| ≤ scale
+ - Bounds: |Δ(s,a,s')| ≤ (1+γ)·scale
+ - Terminal enforcement: Φ(s) = 0 when terminated
+
+ Implementation Details
+ ----------------------
+ This is a stateless pure function for analysis and testing:
+ - All state (Φ(s), γ, configuration) passed explicitly as parameters
+ - Returns diagnostic values (next_potential, pbrs_delta) for inspection
+ - Does not mutate any inputs
+ - Suitable for batch processing and unit testing
+
+ For production RL environment use, see ReforceXY._compute_pbrs_components()
+ which wraps this logic with stateful management (self._last_potential, etc.)
+
+ Parameters
+ ----------
+ current_pnl : float
+ Current state s PnL
+ pnl_target : float
+ Target PnL for ratio normalization: r_pnl = pnl / pnl_target
+ current_duration_ratio : float
+ Current state s duration ratio [0,1]: r_dur = duration / max_duration
+ next_pnl : float
+ Next state s' PnL (after action)
+ next_duration_ratio : float
+ Next state s' duration ratio [0,1]
+ params : RewardParams
+ Configuration dictionary with keys:
+ - potential_gamma: γ (shaping discount factor)
+ - exit_potential_mode: "canonical" | "non_canonical" | heuristic modes
+ - hold_potential_enabled: enable/disable hold potential computation
+ - entry_additive_enabled, exit_additive_enabled: enable non-PBRS additives
+ - hold_potential_scale, hold_potential_gain, transforms, etc.
+ is_exit : bool, optional
+ True if this is an exit transition (Long/Short → Neutral)
+ is_entry : bool, optional
+ True if this is an entry transition (Neutral → Long/Short)
+ prev_potential : float
+ Φ(s) - potential at current state s (must be passed explicitly)
Returns
-------
- tuple[float, float, float, float, float, float]
- (reward, reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive)
- where pbrs_delta = gamma * next_potential - prev_potential is the pure PBRS component.
+ tuple[float, float, float, float, float]
+ (reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive)
+
+ - reward_shaping: Δ(s,a,s') = γ·Φ(s') - Φ(s), the PBRS shaping term
+ - next_potential: Φ(s') for next step (caller must store this)
+ - pbrs_delta: same as reward_shaping (diagnostic/compatibility)
+ - entry_additive: optional non-PBRS entry bonus (0.0 if disabled or not entry)
+ - exit_additive: optional non-PBRS exit bonus (0.0 if disabled or not exit)
Notes
-----
- - Shaping Δ = γ·Φ(next) − Φ(prev).
- - Φ(prev) must be provided explicitly as the stored potential carried across steps.
-      This uses an explicit stored-potential value across steps.
- - Exit potential modes compute Φ(next) from Φ(prev).
- - Entry additive is applied only on entry transitions (based on next_* metrics).
- - Exit additive is applied only on exit transitions (based on current_* metrics).
-
- Note
- ----------------------
- Canonical mode is typically evaluated with additives disabled externally.
- This helper intentionally does not mutate `params`.
+ **State Management:**
+ - Caller is responsible for storing Φ(s') (returned as next_potential)
+ - No internal state; pure function
+
+ **Configuration:**
+ - All parameters read from params dict
+ - Use DEFAULT_MODEL_REWARD_PARAMETERS for defaults
+
+ **Recommendations:**
+ - Use canonical mode for policy-invariant shaping
+ - Monitor Σ_t γ^t·Δ_t ≈ 0 per episode in canonical mode
+ - Disable additives to preserve theoretical PBRS guarantees
+
+ **Validation:**
+ - Returns (0,0,0,0,0) if any output is non-finite
+ - Transform bounds ensure |Φ| ≤ scale
+
+ See Also
+ --------
+ ReforceXY._compute_pbrs_components : Stateful wrapper for RL environment
+ apply_potential_shaping : Deprecated wrapper that adds base_reward
"""
gamma = _get_potential_gamma(params)
if is_exit:
next_potential = _compute_exit_potential(prev_potential, params)
- # PBRS shaping Δ = γ·Φ(next) − Φ(prev)
pbrs_delta = gamma * next_potential - prev_potential
reward_shaping = pbrs_delta
else:
- # When hold potential is disabled, force Φ(next)=0 and emit no PBRS shaping on entry/hold.
if not hold_potential_enabled:
next_potential = 0.0
pbrs_delta = 0.0
next_potential = _compute_hold_potential(
next_pnl, pnl_target, next_duration_ratio, params
)
- # PBRS shaping Δ = γ·Φ(next) − Φ(prev)
pbrs_delta = gamma * next_potential - prev_potential
reward_shaping = pbrs_delta
- # Non-PBRS additives
if canonical_mode:
entry_additive = 0.0
exit_additive = 0.0
cand_exit_add = _compute_exit_additive(
current_pnl, pnl_target, current_duration_ratio, params
)
-
entry_additive = cand_entry_add if is_entry else 0.0
exit_additive = cand_exit_add if is_exit else 0.0
- reward = base_reward + reward_shaping + entry_additive + exit_additive
+ if not (
+ np.isfinite(reward_shaping)
+ and np.isfinite(next_potential)
+ and np.isfinite(pbrs_delta)
+ and np.isfinite(entry_additive)
+ and np.isfinite(exit_additive)
+ ):
+ return 0.0, 0.0, 0.0, 0.0, 0.0
+
+ return (
+ float(reward_shaping),
+ float(next_potential),
+ float(pbrs_delta),
+ float(entry_additive),
+ float(exit_additive),
+ )
+
+
+def apply_potential_shaping(
+ base_reward: float,
+ current_pnl: float,
+ pnl_target: float,
+ current_duration_ratio: float,
+ next_pnl: float,
+ next_duration_ratio: float,
+ params: RewardParams,
+ *,
+ is_exit: bool = False,
+ is_entry: bool = False,
+ prev_potential: float,
+) -> tuple[float, float, float, float, float, float]:
+ """Compute shaped reward and PBRS diagnostics.
+
+ .. deprecated::
+ This function exists only for backward compatibility with existing tests.
+ New code should use :func:`compute_pbrs_components` and compute the total reward manually.
+
+ This is a thin wrapper around `compute_pbrs_components()` that adds PBRS and
+ optional additive terms to the provided `base_reward`.
+
+ Returns
+ -------
+ tuple[float, float, float, float, float, float]
+ (reward, reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive)
+ """
+
+ reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive = (
+ compute_pbrs_components(
+ current_pnl,
+ pnl_target,
+ current_duration_ratio,
+ next_pnl,
+ next_duration_ratio,
+ params,
+ is_exit=is_exit,
+ is_entry=is_entry,
+ prev_potential=prev_potential,
+ )
+ )
+
+ reward = float(base_reward) + reward_shaping + entry_additive + exit_additive
if not np.isfinite(reward):
return float(base_reward), 0.0, 0.0, 0.0, 0.0, 0.0
) -> float:
"""Compute PBRS potential Φ(s) for position holding states.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
return self._compute_pnl_duration_signal(
enabled=self._hold_potential_enabled,
) -> float:
"""Compute exit additive reward for position exit transitions.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
return self._compute_pnl_duration_signal(
enabled=self._exit_additive_enabled,
) -> float:
"""Compute entry additive reward for position entry transitions.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
return self._compute_pnl_duration_signal(
enabled=self._entry_additive_enabled,
def _compute_exit_potential(self, prev_potential: float, gamma: float) -> float:
"""Compute next potential Φ(s') for exit transitions based on exit potential mode.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
mode = self._exit_potential_mode
# "canonical" or "non_canonical"
"""
return hold_potential_enabled and not add_state_info
- def _apply_potential_shaping(
+ def _compute_pbrs_components(
self,
- base_reward: float,
+ *,
action: int,
trade_duration: float,
max_trade_duration: float,
pnl: float,
pnl_target: float,
- ) -> float:
- """Apply potential-based reward shaping (PBRS) (Ng et al. 1999).
+ ) -> tuple[float, float, float]:
+ """Compute potential-based reward shaping (PBRS) components.
+
+ This method computes the PBRS shaping terms without combining them with the base reward,
+ allowing the caller to construct the total reward as R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + additives.
+
+ Canonical PBRS Formula
+ ----------------------
+ R'(s,a,s') = R(s,a,s') + γ·Φ(s') - Φ(s)
- Canonical formula: R'(s,a,s') = R_base(s,a,s') + γ Φ(s') − Φ(s)
+ where:
+ Δ(s,a,s') = γ·Φ(s') - Φ(s) (PBRS shaping term)
Notation
--------
- R_base: base reward; Φ(s)/Φ(s'): potentials (prev/next); γ: shaping discount;
- Δ(s,s') = γΦ(s') − Φ(s); R' = R_base + Δ + optional additives; pnl_ratio = pnl/pnl_target;
- duration_ratio = trade_duration / max_trade_duration (clamped to [0,1]).
+ **States & Actions:**
+ s : current state
+ s' : next state
+ a : action
+
+ **Reward Components:**
+ R(s,a,s') : base reward
+ R'(s,a,s') : shaped reward
+ Δ(s,a,s') : PBRS shaping term = γ·Φ(s') - Φ(s)
+
+ **Potential Function:**
+ Φ(s) : potential at state s
+ γ : discount factor for shaping (gamma)
+
+ **State Variables:**
+ r_pnl : pnl / pnl_target (PnL ratio)
+            r_dur : duration / max_duration (duration ratio, clamped to [0,1])
+ g : gain parameter
+ T_x : transform function (tanh, softsign, etc.)
+
+ **Potential Formula:**
+ Φ(s) = scale · 0.5 · [T_pnl(g·r_pnl) + sgn(r_pnl)·T_dur(g·r_dur)]
PBRS Theory & Compliance
------------------------
- - Ng et al. 1999 (potential-based shaping invariance)
- - Wiewiora et al. 2003 (Φ(terminal)=0 handling)
- - Invariance holds only in canonical mode with additives disabled.
+ - Ng et al. 1999: potential-based shaping preserves optimal policy
+ - Wiewiora et al. 2003: terminal states must have Φ(terminal) = 0
+ - Invariance holds ONLY in canonical mode with additives disabled
+ - Theorem: Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0 over episodes
Architecture & Transitions
--------------------------
- Three mutually exclusive transition types:
+ **Three mutually exclusive transition types:**
1. **Entry** (Neutral → Long/Short):
- - Initialize potential Φ for next step: Φ(s') = hold_potential(next_state)
- - PBRS shaping reward: γΦ(s') - Φ(s) where Φ(s)=0 (neutral has no potential)
- - Optional entry additive (non-PBRS additive term, breaks invariance if used)
+ - Φ(s) = 0 (neutral state has no potential)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - 0 = γ·Φ(s')
+ - Optional entry additive (breaks invariance)
2. **Hold** (Long/Short → Long/Short):
- - Standard PBRS: γΦ(s') - Φ(s) where both potentials computed from hold_potential()
- - Φ(s') accounts for updated PnL and trade duration progression
+ - Φ(s) = hold_potential(s)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - Φ(s)
+ - Φ(s') reflects updated PnL and duration
3. **Exit** (Long/Short → Neutral):
- - **Canonical mode**: Φ(terminal)=0, Δ(s,s') = -Φ(s)
- - **Heuristic modes**: Φ(s') computed by _compute_exit_potential(), Δ(s,s') = γΦ(s')-Φ(s)
- - Optional exit additive (non-PBRS additive term for trade quality summary)
-
- Potential Function Φ(s)
- -----------------------
- Φ(s) = scale * 0.5 * [T_pnl(g * pnl_ratio) + sign(pnl_ratio) * T_dur(g * duration_ratio)]
- Transforms (bounded in [-1,1]): tanh, softsign, arctan, sigmoid (≈ tanh(0.5x)), asinh, clip.
- Parameters: gain g (sharpens/softens), scale.
+ - Φ(s) = hold_potential(s)
+ - Φ(s') depends on exit_potential_mode:
+ * **canonical**: Φ(s') = 0 → Δ = -Φ(s)
+ * **heuristic**: Φ(s') = f(Φ(s)) → Δ = γ·Φ(s') - Φ(s)
+ - Optional exit additive (breaks invariance)
Exit Potential Modes
--------------------
**canonical** (PBRS-compliant):
- - Φ(s')=0 for all exit transitions
- - Maintains theoretical invariance guarantees
- - Shaping reward: γ·0-Φ(s) = -Φ(s)
- - Entry/exit additives automatically disabled to preserve invariance
+ Φ(s') = 0
+ Δ = γ·0 - Φ(s) = -Φ(s)
+ Additives disabled automatically
**non_canonical**:
- - Φ(s')=0 for all exit transitions
- - Entry/exit additives are allowed
+ Φ(s') = 0
+ Δ = -Φ(s)
+ Additives allowed (breaks invariance)
**progressive_release** (heuristic):
- - Φ(s')=Φ(s)*(1-decay_factor), gradual decay
- - Shaping reward: γΦ(s')-Φ(s) = γΦ(s)*(1-d)-Φ(s)
+ Φ(s') = Φ(s)·(1 - d) where d = decay_factor
+ Δ = γ·Φ(s)·(1-d) - Φ(s)
**spike_cancel** (heuristic):
- - Φ(s')=Φ(s)/γ (γ>0 finite)
- - Shaping reward: γΦ(s')-Φ(s) = γ*(Φ(s)/γ)-Φ(s) = 0
+ Φ(s') = Φ(s)/γ
+ Δ = γ·(Φ(s)/γ) - Φ(s) = 0
**retain_previous** (heuristic):
- - Φ(s')=Φ(s), full retention
- - Shaping reward: (γ-1)Φ(s)
-
- Additive Components & Path Dependence
- ------------------------------------
- **Entry/Exit Additive Terms**: Non-PBRS additive rewards that break invariance
- - Entry additive: Applied at entry transitions, computed via _compute_entry_additive()
- - Exit additive: Applied at exit transitions, computed via _compute_exit_additive()
- - Neither additive persists in stored potential (maintains neutrality)
+ Φ(s') = Φ(s)
+ Δ = γ·Φ(s) - Φ(s) = (γ-1)·Φ(s)
- **Path Dependence**: Only canonical preserves invariance; others introduce path dependence.
+ Additive Terms (Non-PBRS)
+ --------------------------
+ Entry and exit additives are **optional bonuses** that break PBRS invariance:
+ - Entry additive: applied on Neutral→Long/Short transitions
+ - Exit additive: applied on Long/Short→Neutral transitions
+ - These do NOT persist in Φ(s) storage
Invariance & Validation
-----------------------
- **Theoretical Guarantee**: Canonical + no additives ⇒ Σ_t γ^t Δ_t = 0 (Φ(start)=Φ(end)=0).
-
- **Deviations from Theory**:
- - Heuristic exit modes violate invariance
- - Entry/exit additives break policy invariance
- - Non-canonical modes may cause path-dependent learning
-
- **Robustness**:
- - Bounded transforms prevent potential explosion
- - Finite value validation with fallback to 0
- - Terminal state enforcement: Φ(s)=0 when terminated=True
- - All transform functions are strictly bounded in [-1, 1], ensuring numerical stability
- - Bounds: |Φ(s)| ≤ scale ; |Δ(s,s')| ≤ (1+γ)*scale
+ **Theoretical Guarantee:**
+ Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0
+ (Φ(start) = Φ(end) = 0)
+
+ **Deviations from Theory:**
+ - Heuristic exit modes violate invariance
+ - Entry/exit additives break policy invariance
+ - Non-canonical modes introduce path dependence
+
+ **Robustness:**
+ - All transforms bounded: |T_x| ≤ 1
+ - Validation: |Φ(s)| ≤ scale
+ - Bounds: |Δ(s,a,s')| ≤ (1+γ)·scale
+ - Terminal enforcement: Φ(s) = 0 when terminated
+
+ Implementation Details
+ ----------------------
+ This method wraps the core PBRS logic for use in the RL environment:
+ - Reads Φ(s) from self._last_potential (previous state potential)
+ - Reads γ from self._potential_gamma
+ - Reads configuration from self._exit_potential_mode, self._entry_additive_enabled, etc.
+ - Computes next_position, next_duration_ratio, is_entry, is_exit internally
+ - Stores Φ(s') to self._last_potential for next step
+ - Updates diagnostic accumulators (_total_reward_shaping, _total_entry_additive, etc.)
Parameters
----------
- base_reward : float
- Original reward before shaping
action : int
- Action taken leading to transition
+ Action taken: determines transition type (entry/hold/exit)
trade_duration : float
- Current trade duration in candles
+ Current trade duration in candles (for current state s)
max_trade_duration : float
- Maximum allowed trade duration
+ Maximum allowed trade duration (for normalization)
pnl : float
- Current position PnL
+ Current position PnL (for current state s)
pnl_target : float
- Target PnL for normalization
+ Target PnL for ratio normalization: r_pnl = pnl / pnl_target
Returns
-------
- float
- Shaped reward R'(s,a,s') = R_base + Δ(s,s') + optional_additives
+ tuple[float, float, float]
+ (reward_shaping, entry_additive, exit_additive)
+
+ - reward_shaping: Δ(s,a,s') = γ·Φ(s') - Φ(s), the PBRS shaping term
+ - entry_additive: optional non-PBRS entry bonus (0.0 if disabled or not entry)
+ - exit_additive: optional non-PBRS exit bonus (0.0 if disabled or not exit)
Notes
-----
- - Canonical mode recommended for invariance
- - Monitor discounted Σ γ^t Δ_t (≈0 per episode canonical)
- - Heuristic exit modes may affect convergence
- - Transform validation delegated to analysis tools
- - Φ reset at exits (canonical) enables telescoping cancellation
+ **State Management:**
+ - Current Φ(s): read from self._last_potential
+ - Next Φ(s'): computed and stored to self._last_potential
+ - Transition type: inferred from self._position and action
+
+ **Configuration Sources:**
+ - γ: self._potential_gamma
+ - Exit mode: self._exit_potential_mode
+ - Additives: self._entry_additive_enabled, self._exit_additive_enabled
+ - Transforms: self._hold_potential_transform_pnl, etc.
+
+ **Recommendations:**
+ - Use canonical mode for policy-invariant shaping
+ - Monitor Σ_t γ^t·Δ_t ≈ 0 per episode in canonical mode
+ - Disable additives to preserve theoretical PBRS guarantees
+
+ See Also
+ --------
+ reward_space_analysis.compute_pbrs_components : Stateless version for analysis
"""
+ prev_potential = float(self._last_potential)
+
if not self._hold_potential_enabled and not (
self._entry_additive_enabled or self._exit_additive_enabled
):
- return base_reward
- prev_potential = self._last_potential
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(prev_potential)
+ self._last_entry_additive = 0.0
+ self._last_exit_additive = 0.0
+ self._last_reward_shaping = 0.0
+ return 0.0, 0.0, 0.0
+
next_position, next_trade_duration, next_pnl = self._get_next_transition_state(
action=action, trade_duration=trade_duration, pnl=pnl
)
) and next_position in (Positions.Long, Positions.Short)
gamma = self._potential_gamma
- if is_entry:
+
+ reward_shaping = 0.0
+ entry_additive = 0.0
+ exit_additive = 0.0
+ next_potential = prev_potential
+
+ if is_entry or is_hold:
if self._hold_potential_enabled:
- potential = self._compute_hold_potential(
+ next_potential = self._compute_hold_potential(
next_position, next_duration_ratio, next_pnl, pnl_target
)
- reward_shaping = gamma * potential - prev_potential
- self._last_potential = potential
+ reward_shaping = gamma * next_potential - prev_potential
else:
+ next_potential = 0.0
reward_shaping = 0.0
- self._last_potential = 0.0
- self._last_exit_additive = 0.0
- self._last_entry_additive = 0.0
- entry_additive = 0.0
- if self._entry_additive_enabled and not self.is_pbrs_invariant_mode():
+
+ if (
+ is_entry
+ and self._entry_additive_enabled
+ and not self.is_pbrs_invariant_mode()
+ ):
entry_additive = self._compute_entry_additive(
pnl=next_pnl,
pnl_target=pnl_target,
duration_ratio=next_duration_ratio,
)
- self._last_entry_additive = float(entry_additive)
self._total_entry_additive += float(entry_additive)
- self._last_reward_shaping = float(reward_shaping)
- self._total_reward_shaping += float(reward_shaping)
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- return base_reward + reward_shaping + entry_additive
- elif is_hold:
- if self._hold_potential_enabled:
- potential = self._compute_hold_potential(
- next_position, next_duration_ratio, next_pnl, pnl_target
- )
- reward_shaping = gamma * potential - prev_potential
- self._last_potential = potential
- else:
- reward_shaping = 0.0
- self._last_potential = 0.0
- self._last_entry_additive = 0.0
- self._last_exit_additive = 0.0
- self._last_reward_shaping = float(reward_shaping)
- self._total_reward_shaping += float(reward_shaping)
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- return base_reward + reward_shaping
+
elif is_exit:
if (
self._exit_potential_mode
== ReforceXY._EXIT_POTENTIAL_MODES[1] # "non_canonical"
):
next_potential = 0.0
- exit_reward_shaping = -prev_potential
+ reward_shaping = -prev_potential
else:
next_potential = self._compute_exit_potential(prev_potential, gamma)
- exit_reward_shaping = gamma * next_potential - prev_potential
- self._last_entry_additive = 0.0
- self._last_exit_additive = 0.0
- exit_additive = 0.0
+ reward_shaping = gamma * next_potential - prev_potential
+
if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
duration_ratio = trade_duration / max(max_trade_duration, 1)
exit_additive = self._compute_exit_additive(
pnl, pnl_target, duration_ratio
)
- self._last_exit_additive = float(exit_additive)
self._total_exit_additive += float(exit_additive)
- self._last_potential = next_potential
- self._last_reward_shaping = float(exit_reward_shaping)
- self._total_reward_shaping += float(exit_reward_shaping)
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- return base_reward + exit_reward_shaping + exit_additive
+
else:
# Neutral self-loop
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- self._last_entry_additive = 0.0
- self._last_exit_additive = 0.0
- self._last_reward_shaping = 0.0
- return base_reward
+ next_potential = prev_potential
+ reward_shaping = 0.0
+
+ self._last_potential = float(next_potential)
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(self._last_potential)
+ self._last_entry_additive = float(entry_additive)
+ self._last_exit_additive = float(exit_additive)
+ self._last_reward_shaping = float(reward_shaping)
+ self._total_reward_shaping += float(reward_shaping)
+
+ return float(reward_shaping), float(entry_additive), float(exit_additive)
def _set_observation_space(self) -> None:
"""
3. Hold overtime penalty
4. Exit reward
5. Default fallback (0.0 if no specific reward)
- 6. PBRS application: R'(s,a,s') = R_base + Δ(s,s') + optional_additives
+ 6. PBRS computation and application: R'(s,a,s') = R_base + Δ(s,a,s') + optional_additives
The final shaped reward is what the RL agent receives for learning.
In canonical PBRS mode, the learned policy is theoretically equivalent
Returns
-------
float
- Shaped reward R'(s,a,s') = R_base + Δ(s,s') + optional_additives
+ Shaped reward R'(s,a,s') = R_base + Δ(s,a,s') + optional_additives
"""
model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
base_reward: Optional[float] = None
base_factor = float(
model_reward_parameters.get("base_factor", ReforceXY.DEFAULT_BASE_FACTOR)
)
- idle_factor = base_factor * (self.profit_aim / 4.0)
+ idle_factor = base_factor * (self.profit_aim / self.rr) / 4.0
hold_factor = idle_factor
# 2. Idle penalty
base_reward = 0.0
# 6. Potential-based reward shaping
- return self._apply_potential_shaping(
- base_reward=base_reward,
+ reward_shaping, entry_additive, exit_additive = self._compute_pbrs_components(
action=action,
trade_duration=trade_duration,
max_trade_duration=max_trade_duration,
pnl_target=self._pnl_target,
)
+ return base_reward + reward_shaping + entry_additive + exit_additive
+
def _get_observation(self) -> NDArray[np.float32]:
start_idx = max(self._start_tick, self._current_tick - self.window_size)
end_idx = min(self._current_tick, len(self.signal_features))