From: Jérôme Benoit <jerome.benoit@piment-noir.org>
Date: Thu, 12 Feb 2026 23:10:08 +0000 (+0100)
Subject: fix(ReforceXY): add context-aware guard for efficiency coefficient division
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=80d9ccc9fc2052acbdc32ebfe87c030064e167bb;p=freqai-strategies.git

fix(ReforceXY): add context-aware guard for efficiency coefficient division

Prevent division explosion in _compute_efficiency_coefficient() when
max_unrealized_profit ≈ min_unrealized_profit by requiring a minimum
meaningful range based on pnl_target. Also adds validation warnings
for potential_gamma=0 and pnl_target<=0 edge cases.
---

diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index bef1074..2c61c2a 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -99,6 +99,8 @@ INTERNAL_GUARDS: dict[str, float] = {
     "sim_extreme_pnl_threshold": 0.2,
     "histogram_epsilon": 1e-10,
     "distribution_identity_epsilon": 1e-12,
+    "efficiency_min_range_epsilon": 1e-6,
+    "efficiency_min_range_fraction": 0.01,
 }
 
 # PBRS constants
@@ -943,7 +945,7 @@ def _get_exit_factor(
             pnl_target,
             risk_reward_ratio,
         )
-        * _compute_efficiency_coefficient(params, context, pnl)
+        * _compute_efficiency_coefficient(params, context, pnl, pnl_target)
     )
 
     if _get_bool_param(
@@ -1013,6 +1015,7 @@ def _compute_efficiency_coefficient(
     params: RewardParams,
     context: RewardContext,
     pnl: float,
+    pnl_target: float,
 ) -> float:
     """
     Compute exit efficiency coefficient based on PnL position relative to unrealized extremes.
@@ -1027,6 +1030,7 @@ def _compute_efficiency_coefficient(
             - efficiency_center: Target efficiency ratio (0.0-1.0)
         context: Trade context with unrealized profit/loss extremes
         pnl: Realized profit/loss
+        pnl_target: Target profit threshold for context-aware range validation
 
     Returns:
         float: Coefficient ≥ 0.0 (typically 0.5-1.5 range)
@@ -1038,7 +1042,11 @@ def _compute_efficiency_coefficient(
         max_pnl = max(context.max_unrealized_profit, pnl)
         min_pnl = min(context.min_unrealized_profit, pnl)
         range_pnl = max_pnl - min_pnl
-        if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
+        # Guard against division explosion when max_pnl ≈ min_pnl
+        eps = float(INTERNAL_GUARDS.get("efficiency_min_range_epsilon", 1e-6))
+        frac = float(INTERNAL_GUARDS.get("efficiency_min_range_fraction", 0.01))
+        min_meaningful_range = max(eps, frac * pnl_target)
+        if np.isfinite(range_pnl) and range_pnl >= min_meaningful_range:
             efficiency_ratio = (pnl - min_pnl) / range_pnl
             # For profits (pnl > 0): high ratio = good exit → higher coefficient → amplify gain
             # For losses (pnl < 0): high ratio = good exit → LOWER coefficient → attenuate penalty
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
index 3004f75..c87bfb8 100644
--- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
@@ -33,6 +33,8 @@ from ..helpers import (
 )
 from ..test_base import RewardSpaceTestBase
 
+_DEFAULT_PNL_TARGET = PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO
+
 pytestmark = pytest.mark.components
 
 
@@ -301,7 +303,9 @@ class TestRewardComponents(RewardSpaceTestBase):
             action=Actions.Long_exit,
         )
 
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+        coefficient = _compute_efficiency_coefficient(
+            params, ctx, ctx.current_pnl, _DEFAULT_PNL_TARGET
+        )
 
         self.assertFinite(coefficient, name="efficiency_coefficient")
         self.assertAlmostEqualFloat(coefficient, 1.0, tolerance=TOLERANCE.GENERIC_EQ)
@@ -339,7 +343,9 @@ class TestRewardComponents(RewardSpaceTestBase):
                 position=Positions.Long,
                 action=Actions.Long_exit,
             )
-            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            coefficient = _compute_efficiency_coefficient(
+                params, ctx, ctx.current_pnl, _DEFAULT_PNL_TARGET
+            )
             self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
             coefficients.append(coefficient)
 
@@ -420,7 +426,9 @@ class TestRewardComponents(RewardSpaceTestBase):
                 position=Positions.Long,
                 action=Actions.Long_exit,
             )
-            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            coefficient = _compute_efficiency_coefficient(
+                params, ctx, ctx.current_pnl, _DEFAULT_PNL_TARGET
+            )
             self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
             coefficients.append(coefficient)
             # Simplified reward calculation (ignoring other factors for this test)
diff --git a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
index 25c1b49..a362ba1 100644
--- a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
+++ b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
@@ -668,7 +668,9 @@ def assert_exit_mode_mathematical_validation(
     pnl_target_coefficient = _compute_pnl_target_coefficient(
         params, context.current_pnl, pnl_target, risk_reward_ratio
     )
-    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.current_pnl)
+    efficiency_coefficient = _compute_efficiency_coefficient(
+        params, context, context.current_pnl, pnl_target
+    )
 
     observed_exit_factor = _get_exit_factor(
         base_factor,
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 174a78e..4af4079 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -186,6 +186,9 @@ class ReforceXY(BaseReinforcementLearningModel):
     DEFAULT_CHECK_INVARIANTS: Final[bool] = True
     DEFAULT_EXIT_FACTOR_THRESHOLD: Final[float] = 1_000.0
 
+    DEFAULT_EFFICIENCY_MIN_RANGE_EPSILON: Final[float] = 1e-6
+    DEFAULT_EFFICIENCY_MIN_RANGE_FRACTION: Final[float] = 0.01
+
     _MODEL_TYPES: Final[Tuple[ModelType, ...]] = (
         "PPO",
         "RecurrentPPO",
@@ -1940,6 +1943,12 @@ class MyRLEnv(Base5ActionRLEnv):
         self._potential_gamma = float(
             model_reward_parameters.get("potential_gamma", 0.95)
         )
+        if np.isclose(self._potential_gamma, 0.0):
+            logger.warning(
+                "PBRS [%s]: potential_gamma=0 detected; PBRS delta will be -Φ(s) "
+                "instead of γΦ(s')-Φ(s). This may cause unexpected reward behavior.",
+                self.id,
+            )
 
         # === EXIT POTENTIAL MODE ===
         # exit_potential_mode options:
@@ -2083,6 +2092,16 @@ class MyRLEnv(Base5ActionRLEnv):
 
         # === PNL TARGET ===
         self._pnl_target = float(self.profit_aim * self.rr)
+        if self._pnl_target <= 0:
+            logger.warning(
+                "PBRS [%s]: pnl_target=%.6f must be > 0 (profit_aim=%.4f, rr=%.4f); "
+                "defaulting to 0.01",
+                self.id,
+                self._pnl_target,
+                self.profit_aim,
+                self.rr,
+            )
+            self._pnl_target = 0.01
 
     def _get_next_position(self, action: int) -> Positions:
         if action == Actions.Long_enter.value and self._position == Positions.Neutral:
@@ -3028,7 +3047,12 @@ class MyRLEnv(Base5ActionRLEnv):
             max_pnl = max(self.get_max_unrealized_profit(), pnl)
             min_pnl = min(self.get_min_unrealized_profit(), pnl)
             range_pnl = max_pnl - min_pnl
-            if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
+            # Guard against division explosion when max_pnl ≈ min_pnl
+            min_meaningful_range = max(
+                ReforceXY.DEFAULT_EFFICIENCY_MIN_RANGE_EPSILON,
+                ReforceXY.DEFAULT_EFFICIENCY_MIN_RANGE_FRACTION * self._pnl_target,
+            )
+            if np.isfinite(range_pnl) and range_pnl >= min_meaningful_range:
                 efficiency_ratio = (pnl - min_pnl) / range_pnl
                 if pnl > 0.0:
                     efficiency_coefficient = 1.0 + efficiency_weight * (