fix(reforcexy): avoid potential divide by 0

author Jérôme Benoit <jerome.benoit@piment-noir.org>

Mon, 20 Oct 2025 21:28:11 +0000 (23:28 +0200)

committer Jérôme Benoit <jerome.benoit@piment-noir.org>

Mon, 20 Oct 2025 21:28:11 +0000 (23:28 +0200)
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Mon, 20 Oct 2025 21:28:11 +0000 (23:28 +0200)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Mon, 20 Oct 2025 21:28:11 +0000 (23:28 +0200)
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py

index b118fe89c0771def63df386da9b294a85c00e29e..5ac7416d17a2001612005b54ecd7ebf63308e958 100644 (file)
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -730,15 +730,15 @@ def _get_exit_factor(
              "exit_half_life",
              DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_half_life", 0.5),
          )
-        if hl <= 0.0:
-            if _is_strict_validation(params):
-                raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
+        if hl <= 0.0 and _is_strict_validation(params):
+            raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
+        if np.isclose(hl, 0.0):
              warnings.warn(
-                f"exit_half_life={hl} <= 0; falling back to 0.0",
+                f"exit_half_life={hl} close to 0; falling back to 1.0",
                  RewardDiagnosticsWarning,
                  stacklevel=2,
              )
-            hl = 0.0
+            return 1.0
          return f * math.pow(2.0, -dr / hl)
  
      kernels = {
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py

index 9fa91b729084980753861b51f8efc26300a1974f..4b3be9ab506800a0d9d2578c2fdfdbd71ec4bb4c 100644 (file)
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -2148,8 +2148,8 @@ class MyRLEnv(Base5ActionRLEnv):
  
          def _half_life(f: float, dr: float, p: Mapping) -> float:
              hl = float(p.get("exit_half_life", 0.5))
-            if hl <= 0.0:
-                hl = 0.0
+            if np.isclose(hl, 0.0) or hl < 0.0:
+                return 1.0
              return f * math.pow(2.0, -dr / hl)
  
          strategies: Dict[str, Callable[[float, float, Mapping], float]] = {
@@ -2522,14 +2522,14 @@ class MyRLEnv(Base5ActionRLEnv):
          if terminated:
              # Enforce Φ(terminal)=0 for PBRS invariance (Wiewiora et al. 2003)
              self._last_potential = 0.0
-            eps = 1e-6
-            if self.is_pbrs_invariant_mode() and abs(self._total_reward_shaping) > eps:
-                logger.warning(
-                    "PBRS mode %s invariance deviation: |sum Δ|=%.6f > eps=%.6f",
-                    self._exit_potential_mode,
-                    self._total_reward_shaping,
-                    eps,
-                )
+            # eps = np.finfo(float).eps
+            # if self.is_pbrs_invariant_mode() and abs(self._total_reward_shaping) > eps:
+            #     logger.warning(
+            #         "PBRS mode %s invariance deviation: |sum Δ|=%.6f > eps=%.6f",
+            #         self._exit_potential_mode,
+            #         abs(self._total_reward_shaping),
+            #         eps,
+            #     )
          return (
              self._get_observation(),
              reward,
author	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Mon, 20 Oct 2025 21:28:11 +0000 (23:28 +0200)
committer	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Mon, 20 Oct 2025 21:28:11 +0000 (23:28 +0200)
ReforceXY/reward_space_analysis/reward_space_analysis.py		patch | blob | blame | history
ReforceXY/user_data/freqaimodels/ReforceXY.py		patch | blob | blame | history