From ba7f4136c41998321a223b62c5ea86aa2e386931 Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?=
Date: Thu, 16 Oct 2025 16:12:16 +0200
Subject: [PATCH] docs(reforcexy): fix incorrect definition of PBRS invariance
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jérôme Benoit
---
 ReforceXY/user_data/freqaimodels/ReforceXY.py | 64 +++++++------------
 1 file changed, 24 insertions(+), 40 deletions(-)

diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 08b4ac4..cff90f5 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -1763,29 +1763,21 @@ class MyRLEnv(Base5ActionRLEnv):
         pnl: float,
         pnl_target: float,
     ) -> float:
-        """Apply potential-based reward shaping (PBRS) following Ng et al. 1999.
+        """Apply potential-based reward shaping (PBRS) (Ng et al. 1999).
 
-        Implements the canonical PBRS formula:
-
-            R'(s, a, s') = R_base(s, a, s') + γ Φ(s') - Φ(s)
+        Canonical formula: R'(s,a,s') = R_base(s,a,s') + γ Φ(s') − Φ(s)
 
         Notation
         --------
-        - R_base(s, a, s') : unshaped environment reward (code variable: ``base_reward``)
-        - Φ(s) : potential before transition (code: ``prev_potential`` / ``self._last_potential``)
-        - Φ(s') : potential after transition (computed per transition type)
-        - γ : shaping discount (``self._potential_gamma``)
-        - Δ(s,s') : shaping term = γ Φ(s') - Φ(s) (logged as ``shaping_reward`` per step)
-        - R'(s, a, s') : shaped reward delivered to the agent = R_base + Δ(s,s') + (additives if enabled)
-        - pnl_ratio : pnl / pnl_target (normalized profit component before transform)
-        - duration_ratio : trade_duration / max_trade_duration (clipped to [0,1] before transform)
+        R_base: base reward; Φ(s)/Φ(s'): potentials (prev/next); γ: shaping discount;
+        Δ(s,s') = γΦ(s') − Φ(s); R' = R_base + Δ + optional additives; pnl_ratio = pnl/pnl_target;
+        duration_ratio = trade_duration / max_trade_duration (clamped to [0,1]).
 
         PBRS Theory & Compliance
         ------------------------
-        This implementation follows academic standards for potential-based reward shaping:
-        - Ng et al. 1999: Canonical formula with invariance guarantees
-        - Wiewiora et al. 2003: Terminal state handling (Φ(terminal)=0)
-        - Maintains policy invariance in canonical mode with proper terminal handling
+        - Ng et al. 1999 (potential-based shaping invariance)
+        - Wiewiora et al. 2003 (Φ(terminal)=0 handling)
+        - Invariance holds only in canonical mode with additives disabled.
 
         Architecture & Transitions
         --------------------------
@@ -1807,19 +1799,9 @@ class MyRLEnv(Base5ActionRLEnv):
 
         Potential Function Φ(s)
         -----------------------
-        Hold potential formula: Φ(s) = scale * 0.5 * [T_pnl(g*pnl_ratio) + T_dur(g*duration_ratio)]
-
-        **Bounded Transform Functions** (each maps R -> (-1, 1) except clip which is [-1, 1]):
-        - tanh: tanh(x)
-        - softsign: x / (1 + |x|)
-        - arctan: (2/pi) * arctan(x)
-        - sigmoid: 2σ(x) - 1, σ(x) = 1/(1 + e^(-x))
-        - asinh: x / sqrt(1 + x^2)
-        - clip: clip(x, -1, 1)
-
-        **Parameters**:
-        - gain g: sharpens (g>1) or softens (g<1) transform input
-        - scale: multiplies final potential value
+        Φ(s) = scale * 0.5 * [T_pnl(g * pnl_ratio) + T_dur(g * duration_ratio)]
+        Transforms (bounded in [-1,1]): tanh, softsign, arctan, sigmoid (= tanh(x/2)), asinh, clip.
+        Parameters: gain g (sharpens/softens), scale.
 
         Exit Potential Modes
         --------------------
@@ -1829,12 +1811,16 @@ class MyRLEnv(Base5ActionRLEnv):
         **canonical** (default, Ng et al. 1999 compliant):
         - Φ(s')=0 for all exit transitions
         - Shaping reward: γ·0-Φ(s) = -Φ(s)
         - Entry/exit additives automatically disabled to preserve invariance
 
+        **non_canonical**:
+        - Φ(s')=0 for all exit transitions
+        - Entry/exit additives are allowed
+
         **progressive_release** (heuristic):
         - Φ(s')=Φ(s)*(1-decay_factor), gradual decay
         - Shaping reward: γΦ(s')-Φ(s) = γΦ(s)*(1-d)-Φ(s)
 
         **spike_cancel** (heuristic):
-        - Φ(s')=Φ(s)/γ, aims for zero net shaping
+        - Φ(s')=Φ(s)/γ (γ>0 finite)
         - Shaping reward: γΦ(s')-Φ(s) = γ*(Φ(s)/γ)-Φ(s) = 0
 
         **retain_previous** (heuristic):
@@ -1848,14 +1834,11 @@ class MyRLEnv(Base5ActionRLEnv):
         - Exit additive: Applied at exit transitions, computed via _compute_exit_additive()
         - Neither additive persists in stored potential (maintains neutrality)
 
-        **Path Dependence**: Only canonical mode preserves PBRS invariance. Heuristic
-        exit modes introduce path dependence through non-zero terminal potentials.
+        **Path Dependence**: Only canonical preserves invariance; others introduce path dependence.
 
         Invariance & Validation
         -----------------------
-        **Theoretical Guarantee**: In canonical mode, ∑ Δ(s,s') = 0 over
-        complete episodes due to Φ(terminal)=0. Entry/exit additives are automatically
-        disabled in canonical mode to preserve this invariance.
+        **Theoretical Guarantee**: Canonical + no additives ⇒ Σ_t γ^t Δ_t = 0 (Φ(start)=Φ(end)=0).
 
         **Deviations from Theory**:
         - Heuristic exit modes violate invariance
@@ -1867,6 +1850,7 @@ class MyRLEnv(Base5ActionRLEnv):
         - Finite value validation with fallback to 0
         - Terminal state enforcement: Φ(s)=0 when terminated=True
         - All transform functions are strictly bounded in [-1, 1], ensuring numerical stability
+        - Bounds: |Φ(s)| ≤ scale; |Δ(s,s')| ≤ (1+γ)*scale
 
         Parameters
         ----------
@@ -1890,11 +1874,11 @@ class MyRLEnv(Base5ActionRLEnv):
 
         Notes
         -----
-        - Use canonical mode for theoretical compliance
-        - Monitor ∑Δ(s,s') for invariance validation (should sum to 0 over episodes)
-        - Heuristic exit modes are experimental and may affect convergence
-        - Transform validation removed from runtime (deferred to analysis tools)
-        - In canonical exit mode, Φ is reset to 0 at exit boundaries, ensuring telescoping cancellation (∑Δ=0) over closed episodes
+        - Canonical mode recommended for invariance
+        - Monitor discounted Σ γ^t Δ_t (should be ≈0 per episode in canonical mode)
+        - Heuristic exit modes may affect convergence
+        - Transform validation delegated to analysis tools
+        - Φ reset at exits (canonical) enables telescoping cancellation
         """
         if not self._hold_potential_enabled and not (
            self._entry_additive_enabled or self._exit_additive_enabled
-- 
2.43.0
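
To make the condensed "Potential Function Φ(s)" section of the docstring concrete, here is a minimal Python sketch. The names (_TRANSFORMS, hold_potential) are hypothetical and not the module's actual API; the formulas follow the removed docstring lines, with a single transform standing in for both T_pnl and T_dur. Note that the transform labeled "asinh" is defined in the docstring as x / sqrt(1 + x^2), an algebraic sigmoid bounded in (-1, 1), not the unbounded inverse hyperbolic sine.

import math

# Illustrative only: bounded transforms as listed in the removed docstring lines.
_TRANSFORMS = {
    "tanh": math.tanh,
    "softsign": lambda x: x / (1.0 + abs(x)),
    "arctan": lambda x: (2.0 / math.pi) * math.atan(x),
    "sigmoid": lambda x: math.tanh(0.5 * x),  # 2*sigma(x) - 1 == tanh(x/2) exactly
    # Labeled "asinh" in the docstring but defined there as x/sqrt(1+x^2),
    # which is bounded in (-1, 1); math.asinh itself is unbounded.
    "asinh": lambda x: x / math.sqrt(1.0 + x * x),
    "clip": lambda x: max(-1.0, min(1.0, x)),
}


def hold_potential(
    pnl: float,
    pnl_target: float,
    trade_duration: float,
    max_trade_duration: float,
    transform: str = "tanh",
    gain: float = 1.0,
    scale: float = 1.0,
) -> float:
    """Phi(s) = scale * 0.5 * [T(g * pnl_ratio) + T(g * duration_ratio)]."""
    t = _TRANSFORMS[transform]
    pnl_ratio = pnl / pnl_target if pnl_target else 0.0
    # duration_ratio is clamped to [0, 1] before the transform, per the docstring.
    duration_ratio = min(max(trade_duration / max_trade_duration, 0.0), 1.0)
    return scale * 0.5 * (t(gain * pnl_ratio) + t(gain * duration_ratio))


print(hold_potential(0.02, 0.04, 30, 100))  # ~0.3767 with tanh, g=1, scale=1

Because every transform is bounded in [-1, 1], the bounds stated in the diff follow directly: |Φ(s)| ≤ scale, and |Δ(s,s')| = |γΦ(s') − Φ(s)| ≤ (1+γ)*scale.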
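The invariance guarantee in the "Invariance & Validation" section (Σ_t γ^t Δ_t = 0 when Φ(start)=Φ(end)=0), and its failure under heuristic exit modes, can also be checked numerically. shaping_terms and the sample potential trajectories below are illustrative only, not taken from the module:

def shaping_terms(potentials, gamma):
    """Per-step PBRS terms: delta_t = gamma * Phi(s_{t+1}) - Phi(s_t)."""
    return [gamma * nxt - prev for prev, nxt in zip(potentials, potentials[1:])]


# Canonical-mode episode: Phi is 0 at the start and reset to 0 at the exit,
# with arbitrary hold potentials in between (sample values).
phis = [0.0, 0.31, 0.54, 0.48, 0.0]
gamma = 0.95
deltas = shaping_terms(phis, gamma)

# The discounted sum telescopes to gamma^T * Phi(s_T) - Phi(s_0), hence 0 here.
discounted = sum((gamma ** t) * d for t, d in enumerate(deltas))
assert abs(discounted) < 1e-12

# A heuristic exit mode (e.g. retain_previous) leaves Phi(s_T) != 0, so the
# same sum is gamma^T * Phi(s_T) - Phi(s_0) != 0 and invariance is lost.
phis_retain = [0.0, 0.31, 0.54, 0.48, 0.48]
residual = sum(
    (gamma ** t) * d for t, d in enumerate(shaping_terms(phis_retain, gamma))
)
print(f"canonical: {discounted:+.3e}, retain_previous: {residual:+.3e}")

The canonical sum cancels exactly, while retain_previous leaves a residual of γ^T·Φ(s_T), which is the path dependence the docstring warns about.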