fix(ReforceXY): add context-aware guard for efficiency coefficient division

author Jérôme Benoit <jerome.benoit@piment-noir.org>

Thu, 12 Feb 2026 23:10:08 +0000 (00:10 +0100)

committer Jérôme Benoit <jerome.benoit@piment-noir.org>

Thu, 12 Feb 2026 23:10:08 +0000 (00:10 +0100)
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Thu, 12 Feb 2026 23:10:08 +0000 (00:10 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Thu, 12 Feb 2026 23:10:08 +0000 (00:10 +0100)
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py

index bef107468116fdc5ed099e5ddb6e820f2088ac56..2c61c2ab26546b6ee37e8cd12f2193fc3f52b9c6 100644 (file)
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -99,6 +99,8 @@ INTERNAL_GUARDS: dict[str, float] = {
      "sim_extreme_pnl_threshold": 0.2,
      "histogram_epsilon": 1e-10,
      "distribution_identity_epsilon": 1e-12,
+    "efficiency_min_range_epsilon": 1e-6,
+    "efficiency_min_range_fraction": 0.01,
  }
  
  # PBRS constants
@@ -943,7 +945,7 @@ def _get_exit_factor(
              pnl_target,
              risk_reward_ratio,
          )
-        * _compute_efficiency_coefficient(params, context, pnl)
+        * _compute_efficiency_coefficient(params, context, pnl, pnl_target)
      )
  
      if _get_bool_param(
@@ -1013,6 +1015,7 @@ def _compute_efficiency_coefficient(
      params: RewardParams,
      context: RewardContext,
      pnl: float,
+    pnl_target: float,
  ) -> float:
      """
      Compute exit efficiency coefficient based on PnL position relative to unrealized extremes.
@@ -1027,6 +1030,7 @@ def _compute_efficiency_coefficient(
              - efficiency_center: Target efficiency ratio (0.0-1.0)
          context: Trade context with unrealized profit/loss extremes
          pnl: Realized profit/loss
+        pnl_target: Target profit threshold for context-aware range validation
  
      Returns:
          float: Coefficient ≥ 0.0 (typically 0.5-1.5 range)
@@ -1038,7 +1042,11 @@ def _compute_efficiency_coefficient(
          max_pnl = max(context.max_unrealized_profit, pnl)
          min_pnl = min(context.min_unrealized_profit, pnl)
          range_pnl = max_pnl - min_pnl
-        if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
+        # Avoid dividing by a negligible range (max_pnl ≈ min_pnl); floor is relative to pnl_target
+        eps = float(INTERNAL_GUARDS.get("efficiency_min_range_epsilon", 1e-6))
+        frac = float(INTERNAL_GUARDS.get("efficiency_min_range_fraction", 0.01))
+        min_meaningful_range = max(eps, frac * pnl_target)
+        if np.isfinite(range_pnl) and range_pnl >= min_meaningful_range:
              efficiency_ratio = (pnl - min_pnl) / range_pnl
              # For profits (pnl > 0): high ratio = good exit → higher coefficient → amplify gain
              # For losses (pnl < 0): high ratio = good exit → LOWER coefficient → attenuate penalty
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py

index 3004f7513ed8135adcac82808cffd5f69d993661..c87bfb856ba1e134929bf3a5f75c587ba70db84b 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
@@ -33,6 +33,8 @@ from ..helpers import (
  )
  from ..test_base import RewardSpaceTestBase
  
+_DEFAULT_PNL_TARGET = PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO
+
  pytestmark = pytest.mark.components
  
  
@@ -301,7 +303,9 @@ class TestRewardComponents(RewardSpaceTestBase):
              action=Actions.Long_exit,
          )
  
-        coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+        coefficient = _compute_efficiency_coefficient(
+            params, ctx, ctx.current_pnl, _DEFAULT_PNL_TARGET
+        )
  
          self.assertFinite(coefficient, name="efficiency_coefficient")
          self.assertAlmostEqualFloat(coefficient, 1.0, tolerance=TOLERANCE.GENERIC_EQ)
@@ -339,7 +343,9 @@ class TestRewardComponents(RewardSpaceTestBase):
                  position=Positions.Long,
                  action=Actions.Long_exit,
              )
-            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            coefficient = _compute_efficiency_coefficient(
+                params, ctx, ctx.current_pnl, _DEFAULT_PNL_TARGET
+            )
              self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
              coefficients.append(coefficient)
  
@@ -420,7 +426,9 @@ class TestRewardComponents(RewardSpaceTestBase):
                  position=Positions.Long,
                  action=Actions.Long_exit,
              )
-            coefficient = _compute_efficiency_coefficient(params, ctx, ctx.current_pnl)
+            coefficient = _compute_efficiency_coefficient(
+                params, ctx, ctx.current_pnl, _DEFAULT_PNL_TARGET
+            )
              self.assertFinite(coefficient, name=f"efficiency_coefficient[pnl={pnl}]")
              coefficients.append(coefficient)
              # Simplified reward calculation (ignoring other factors for this test)
diff --git a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py

index 25c1b499f7c6ef3436a7515f38ffdf312c8eb4ec..a362ba1e7d6c8e27dcb1721da7633e08a224aee9 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
+++ b/ReforceXY/reward_space_analysis/tests/helpers/assertions.py
@@ -668,7 +668,9 @@ def assert_exit_mode_mathematical_validation(
      pnl_target_coefficient = _compute_pnl_target_coefficient(
          params, context.current_pnl, pnl_target, risk_reward_ratio
      )
-    efficiency_coefficient = _compute_efficiency_coefficient(params, context, context.current_pnl)
+    efficiency_coefficient = _compute_efficiency_coefficient(
+        params, context, context.current_pnl, pnl_target
+    )
  
      observed_exit_factor = _get_exit_factor(
          base_factor,
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py

index 174a78e1694d63635afe61d9464563f42db924c3..4af407976a7081562afd56730328c9bfc07b4ce9 100644 (file)
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -186,6 +186,9 @@ class ReforceXY(BaseReinforcementLearningModel):
      DEFAULT_CHECK_INVARIANTS: Final[bool] = True
      DEFAULT_EXIT_FACTOR_THRESHOLD: Final[float] = 1_000.0
  
+    DEFAULT_EFFICIENCY_MIN_RANGE_EPSILON: Final[float] = 1e-6
+    DEFAULT_EFFICIENCY_MIN_RANGE_FRACTION: Final[float] = 0.01
+
      _MODEL_TYPES: Final[Tuple[ModelType, ...]] = (
          "PPO",
          "RecurrentPPO",
@@ -1940,6 +1943,12 @@ class MyRLEnv(Base5ActionRLEnv):
          self._potential_gamma = float(
              model_reward_parameters.get("potential_gamma", 0.95)
          )
+        if np.isclose(self._potential_gamma, 0.0):
+            logger.warning(
+                "PBRS [%s]: potential_gamma=0 detected; PBRS delta will be -Φ(s) "
+                "instead of γΦ(s')-Φ(s). This may cause unexpected reward behavior.",
+                self.id,
+            )
  
          # === EXIT POTENTIAL MODE ===
          # exit_potential_mode options:
@@ -2083,6 +2092,16 @@ class MyRLEnv(Base5ActionRLEnv):
  
          # === PNL TARGET ===
          self._pnl_target = float(self.profit_aim * self.rr)
+        if self._pnl_target <= 0:
+            logger.warning(
+                "PBRS [%s]: pnl_target=%.6f must be > 0 (profit_aim=%.4f, rr=%.4f); "
+                "defaulting to 0.01",
+                self.id,
+                self._pnl_target,
+                self.profit_aim,
+                self.rr,
+            )
+            self._pnl_target = 0.01
  
      def _get_next_position(self, action: int) -> Positions:
          if action == Actions.Long_enter.value and self._position == Positions.Neutral:
@@ -3028,7 +3047,12 @@ class MyRLEnv(Base5ActionRLEnv):
              max_pnl = max(self.get_max_unrealized_profit(), pnl)
              min_pnl = min(self.get_min_unrealized_profit(), pnl)
              range_pnl = max_pnl - min_pnl
-            if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
+            # Avoid dividing by a negligible range (max_pnl ≈ min_pnl); floor is relative to pnl_target
+            min_meaningful_range = max(
+                ReforceXY.DEFAULT_EFFICIENCY_MIN_RANGE_EPSILON,
+                ReforceXY.DEFAULT_EFFICIENCY_MIN_RANGE_FRACTION * self._pnl_target,
+            )
+            if np.isfinite(range_pnl) and range_pnl >= min_meaningful_range:
                  efficiency_ratio = (pnl - min_pnl) / range_pnl
                  if pnl > 0.0:
                      efficiency_coefficient = 1.0 + efficiency_weight * (
author	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Thu, 12 Feb 2026 23:10:08 +0000 (00:10 +0100)
committer	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Thu, 12 Feb 2026 23:10:08 +0000 (00:10 +0100)
ReforceXY/reward_space_analysis/reward_space_analysis.py		patch \| blob \| blame \| history
ReforceXY/reward_space_analysis/tests/components/test_reward_components.py		patch \| blob \| blame \| history
ReforceXY/reward_space_analysis/tests/helpers/assertions.py		patch \| blob \| blame \| history
ReforceXY/user_data/freqaimodels/ReforceXY.py		patch \| blob \| blame \| history