From 851d61e190cea629861065fe309c9b5783845c8c Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?=
Date: Tue, 30 Dec 2025 01:30:11 +0100
Subject: [PATCH] docs(ReforceXY): clarify PBRS formulas in reward calculation and analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jérôme Benoit
---
 .../reward_space_analysis.py                  | 82 ++++++++++++++++++-
 ReforceXY/user_data/freqaimodels/ReforceXY.py | 16 +++-
 2 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index e6daa92..22c318c 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -788,7 +788,7 @@ class RewardBreakdown:
     next_potential: float = 0.0
     # PBRS helpers
    base_reward: float = 0.0
-    pbrs_delta: float = 0.0  # Δ(s,a,s') = γ·Φ(s') − Φ(s)  # noqa: RUF003
+    pbrs_delta: float = 0.0  # Δ(s,a,s') = γ·Φ(s') - Φ(s)  # noqa: RUF003
     invariance_correction: float = 0.0


@@ -1173,6 +1173,60 @@ def calculate_reward(
     action_masking: bool,
     prev_potential: float = np.nan,
 ) -> RewardBreakdown:
+    """Calculate the complete reward with base reward and PBRS shaping.
+
+    This function computes the full reward pipeline: base reward calculation,
+    PBRS (Potential-Based Reward Shaping), and optional additives.
+
+    Reward Formula
+    --------------
+    R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + entry_additive + exit_additive
+
+    where:
+    - R(s,a,s'): Base reward (invalid/idle/hold penalty or exit reward)
+    - Δ(s,a,s'): PBRS delta term = γ·Φ(s') - Φ(s)
+    - entry_additive: Optional entry bonus (disabled in canonical mode)
+    - exit_additive: Optional exit bonus (disabled in canonical mode)
+
+    Parameters
+    ----------
+    context : RewardContext
+        Current reward context (position, action, PnL, duration, etc.)
+    params : RewardParams
+        Reward parameter dictionary with configuration
+    base_factor : float
+        Base scaling factor for reward components
+    profit_aim : float
+        Target profit for normalization
+    risk_reward_ratio : float
+        Risk/reward ratio for trade evaluation
+    short_allowed : bool
+        Whether short positions are permitted
+    action_masking : bool
+        Whether to apply action masking (affects invalid action penalty)
+    prev_potential : float, optional
+        Previous state potential Φ(s), by default np.nan
+
+    Returns
+    -------
+    RewardBreakdown
+        Complete breakdown of reward components including:
+        - total: Final shaped reward R'(s,a,s')
+        - base_reward: R(s,a,s')
+        - reward_shaping: Δ(s,a,s')
+        - entry_additive: Entry bonus
+        - exit_additive: Exit bonus
+        - prev_potential: Φ(s)
+        - next_potential: Φ(s')
+        - pbrs_delta: Raw PBRS term (reward_shaping minus any invariance correction)
+        - Component-specific values (invalid_penalty, idle_penalty, etc.)
+
+    Notes
+    -----
+    This is the reference implementation of the reward calculation used in
+    testing and analysis. It mirrors the logic in ReforceXY.calculate_reward()
+    but returns a detailed breakdown for diagnostic purposes.
+ """ breakdown = RewardBreakdown() is_valid = _is_valid_action( @@ -3196,8 +3250,12 @@ def compute_pbrs_components( ---------------------- R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + Non Canonical PBRS Formula + -------------------------- + R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + entry_additive + exit_additive + where: - Δ(s,a,s') = gamma * Phi(s') - Phi(s) (PBRS shaping term) + Δ(s,a,s') = γ·Φ(s') - Φ(s) (PBRS shaping term) Hold Potential Formula ---------------------- @@ -3211,6 +3269,22 @@ def compute_pbrs_components( Then: Φ_hold(s) = scale · 0.5 · [T_pnl(g·r_pnl) + sign(r_pnl)·m_dur·T_dur(g·r_dur)] + + Returns + ------- + tuple[float, float, float, float, float] + (reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive) + + - reward_shaping: Δ(s,a,s') = γ·Φ(s') - Φ(s), the PBRS shaping term + - next_potential: Φ(s'), the potential function value for next state + - pbrs_delta: Same as reward_shaping (kept for backward compatibility) + - entry_additive: Optional non-PBRS entry bonus (0.0 if disabled or not entry) + - exit_additive: Optional non-PBRS exit bonus (0.0 if disabled or not exit) + + Notes + ----- + In canonical mode (exit_potential_mode='canonical'), entry_additive and exit_additive + are forced to 0.0 to preserve PBRS policy invariance. """ gamma = _get_potential_gamma(params) @@ -3935,10 +4009,10 @@ def write_complete_statistical_analysis( f.write("|--------|-------|-------------|\n") f.write(f"| Mean Base Reward | {mean_base:.6f} | Average reward before PBRS |\n") f.write(f"| Std Base Reward | {std_base:.6f} | Variability of base reward |\n") - f.write(f"| Mean PBRS Delta | {mean_pbrs:.6f} | Average γ·Φ(s')−Φ(s) |\n") # noqa: RUF001 + f.write(f"| Mean PBRS Delta | {mean_pbrs:.6f} | Average γ·Φ(s') - Φ(s) |\n") # noqa: RUF001 f.write(f"| Std PBRS Delta | {std_pbrs:.6f} | Variability of PBRS delta |\n") f.write( - f"| Mean Invariance Correction | {mean_inv_corr:.6f} | Average reward_shaping − pbrs_delta |\n" # noqa: RUF001 + f"| Mean Invariance Correction | {mean_inv_corr:.6f} | Average reward_shaping - pbrs_delta |\n" # noqa: RUF001 ) f.write( f"| Std Invariance Correction | {std_inv_corr:.6f} | Variability of correction |\n" diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py index 4bdba9e..66c3ef6 100644 --- a/ReforceXY/user_data/freqaimodels/ReforceXY.py +++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py @@ -2360,6 +2360,10 @@ class MyRLEnv(Base5ActionRLEnv): ---------------------- R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + Non-Canonical PBRS Formula + -------------------------- + R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + entry_additive + exit_additive + where: Δ(s,a,s') = γ·Φ(s') - Φ(s) (PBRS shaping term) @@ -2966,7 +2970,7 @@ class MyRLEnv(Base5ActionRLEnv): 3. Hold overtime penalty 4. Exit reward 5. Default fallback (0.0 if no specific reward) - 6. PBRS computation and application: R'(s,a,s') = R_base + Δ(s,a,s') + optional_additives + 6. PBRS computation and application: R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + entry_additive + exit_additive The final shaped reward is what the RL agent receives for learning. 
         In canonical PBRS mode, the learned policy is theoretically equivalent
@@ -2980,7 +2984,15 @@ class MyRLEnv(Base5ActionRLEnv):
         Returns
         -------
         float
-            Shaped reward R'(s,a,s') = R_base + Δ(s,a,s') + optional_additives
+            Shaped reward R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + entry_additive + exit_additive
+
+            Implementation: base_reward + reward_shaping + entry_additive + exit_additive
+
+            where:
+            - R(s,a,s') / base_reward: Base reward (invalid/idle/hold penalty or exit reward)
+            - Δ(s,a,s') / reward_shaping: PBRS delta term = γ·Φ(s') - Φ(s)
+            - entry_additive: Optional entry bonus (breaks PBRS invariance)
+            - exit_additive: Optional exit bonus (breaks PBRS invariance)
         """
         model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
         base_reward: Optional[float] = None
--
2.43.0
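
To make the shaped-reward composition in the patched docstrings concrete, here is a minimal Python sketch. It is not the ReforceXY implementation; the function and parameter names (shaped_reward, phi_prev, phi_next) are illustrative stand-ins for Φ(s), Φ(s'), and γ.

```python
def shaped_reward(
    base_reward: float,  # R(s,a,s'): invalid/idle/hold penalty or exit reward
    phi_prev: float,  # Φ(s): potential of the current state
    phi_next: float,  # Φ(s'): potential of the next state
    gamma: float,  # γ: PBRS discount factor
    entry_additive: float = 0.0,  # forced to 0.0 in canonical mode
    exit_additive: float = 0.0,  # forced to 0.0 in canonical mode
) -> float:
    """Sketch of R'(s,a,s') = R(s,a,s') + γ·Φ(s') - Φ(s) + additives."""
    pbrs_delta = gamma * phi_next - phi_prev  # Δ(s,a,s')
    return base_reward + pbrs_delta + entry_additive + exit_additive
```

In canonical mode both additives are zero, so the shaping reduces to the pure PBRS delta.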
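
The Hold Potential Formula can be sketched the same way. The bounded transforms T_pnl and T_dur are assumed here to be tanh, and the names (hold_potential, dur_weight) are hypothetical; the actual transforms and parameter plumbing live in reward_space_analysis.py.

```python
import math

def hold_potential(
    r_pnl: float,  # normalized PnL ratio r_pnl
    r_dur: float,  # normalized duration ratio r_dur
    gain: float = 1.0,  # g: gain applied before the transforms
    scale: float = 1.0,  # overall potential scale
    dur_weight: float = 1.0,  # m_dur: duration term weight
) -> float:
    """Sketch of Φ_hold(s) = scale · 0.5 · [T_pnl(g·r_pnl) + sign(r_pnl)·m_dur·T_dur(g·r_dur)]."""
    t_pnl = math.tanh(gain * r_pnl)  # T_pnl, assumed tanh
    t_dur = math.tanh(gain * r_dur)  # T_dur, assumed tanh
    sign = 1.0 if r_pnl >= 0 else -1.0  # sign(r_pnl)
    return scale * 0.5 * (t_pnl + sign * dur_weight * t_dur)
```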
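
Why canonical mode preserves policy invariance (Ng et al., 1999): the discounted PBRS deltas telescope over an episode, so the total shaping depends only on the first and last potentials and cannot change action preferences. A quick self-contained check with made-up potential values:

```python
gamma = 0.95
phis = [0.0, 0.4, 0.7, 0.2, 0.0]  # Φ(s_0)..Φ(s_T); terminal potential 0.0
deltas = [gamma * phis[t + 1] - phis[t] for t in range(len(phis) - 1)]
discounted_shaping = sum(gamma**t * d for t, d in enumerate(deltas))
# Telescoping identity: Σ γ^t·Δ_t = γ^T·Φ(s_T) - Φ(s_0)
assert abs(discounted_shaping - (gamma ** (len(phis) - 1) * phis[-1] - phis[0])) < 1e-12
```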