From 4677eee45aef7d0a0a3068a9fbcb0f80402ecf9d Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= <jerome.benoit@piment-noir.org>
Date: Mon, 20 Oct 2025 23:28:11 +0200
Subject: [PATCH] fix(reforcexy): avoid potential divide by 0
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
---
 .../reward_space_analysis.py                  | 10 +++++-----
 ReforceXY/user_data/freqaimodels/ReforceXY.py | 20 +++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index b118fe8..5ac7416 100644
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -730,15 +730,15 @@ def _get_exit_factor(
             "exit_half_life",
             DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_half_life", 0.5),
         )
-        if hl <= 0.0:
-            if _is_strict_validation(params):
-                raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
+        if hl <= 0.0 and _is_strict_validation(params):
+            raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
+        if np.isclose(hl, 0.0):
             warnings.warn(
-                f"exit_half_life={hl} <= 0; falling back to 0.0",
+                f"exit_half_life={hl} close to 0; falling back to 1.0",
                 RewardDiagnosticsWarning,
                 stacklevel=2,
             )
-            hl = 0.0
+            return 1.0
         return f * math.pow(2.0, -dr / hl)
 
     kernels = {
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 9fa91b7..4b3be9a 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -2148,8 +2148,8 @@ class MyRLEnv(Base5ActionRLEnv):
 
         def _half_life(f: float, dr: float, p: Mapping) -> float:
             hl = float(p.get("exit_half_life", 0.5))
-            if hl <= 0.0:
-                hl = 0.0
+            if np.isclose(hl, 0.0) or hl < 0.0:
+                return 1.0
             return f * math.pow(2.0, -dr / hl)
 
         strategies: Dict[str, Callable[[float, float, Mapping], float]] = {
@@ -2522,14 +2522,14 @@ class MyRLEnv(Base5ActionRLEnv):
         if terminated:
             # Enforce Φ(terminal)=0 for PBRS invariance (Wiewiora et al. 2003)
             self._last_potential = 0.0
-            eps = 1e-6
-            if self.is_pbrs_invariant_mode() and abs(self._total_reward_shaping) > eps:
-                logger.warning(
-                    "PBRS mode %s invariance deviation: |sum Î|=%.6f > eps=%.6f",
-                    self._exit_potential_mode,
-                    self._total_reward_shaping,
-                    eps,
-                )
+            # eps = np.finfo(float).eps
+            # if self.is_pbrs_invariant_mode() and abs(self._total_reward_shaping) > eps:
+            #     logger.warning(
+            #         "PBRS mode %s invariance deviation: |sum Î|=%.6f > eps=%.6f",
+            #         self._exit_potential_mode,
+            #         abs(self._total_reward_shaping),
+            #         eps,
+            #     )
         return (
             self._get_observation(),
             reward,
-- 
2.43.0