refactor(ReforceXY): factor out reward tunable defaults

author Jérôme Benoit <jerome.benoit@piment-noir.org>
Tue, 16 Dec 2025 10:13:32 +0000 (11:13 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Tue, 16 Dec 2025 10:13:32 +0000 (11:13 +0100)
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py

index b60344f0173e2853cd1bd1b7bb75fbb740c59605..f8a2cc6b52277dbc923fd28fa08d25a022d9b450 100644 (file)
--- a/ReforceXY/reward_space_analysis/reward_space_analysis.py
+++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py
@@ -707,11 +707,11 @@ def _get_exit_factor(
      )
      if exit_linear_slope < 0.0:
          warnings.warn(
-            "exit_linear_slope < 0; falling back to 0.0",
+            "exit_linear_slope < 0; falling back to 1.0",
              RewardDiagnosticsWarning,
              stacklevel=2,
          )
-        exit_linear_slope = 0.0
+        exit_linear_slope = 1.0
  
      def _legacy_kernel(f: float, dr: float) -> float:
          return f * (1.5 if dr <= 1.0 else 0.5)
@@ -996,12 +996,13 @@ def calculate_reward(
      if "risk_reward_ratio" in params:
          risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(risk_reward_ratio))
  
-    profit_target_final = profit_target * risk_reward_ratio
-    idle_factor = factor * profit_target_final / 4.0
+    pnl_target = float(profit_target * risk_reward_ratio)
+
+    idle_factor = factor * pnl_target / 4.0
      pnl_factor = _get_pnl_factor(
          params,
          context,
-        profit_target_final,
+        pnl_target,
          risk_reward_ratio,
      )
      hold_factor = idle_factor
@@ -1098,7 +1099,9 @@ def calculate_reward(
  
      if pbrs_enabled and not is_neutral:
          # Compute Φ(s) for the current state to preserve telescoping semantics Δ = γ·Φ(s') − Φ(s)
-        current_potential = _compute_hold_potential(current_pnl, current_duration_ratio, params)
+        current_potential = _compute_hold_potential(
+            current_pnl, pnl_target, current_duration_ratio, params
+        )
          if not np.isfinite(current_potential):
              current_potential = 0.0
  
@@ -1112,6 +1115,7 @@ def calculate_reward(
              apply_potential_shaping(
                  base_reward=base_reward,
                  current_pnl=current_pnl,
+                pnl_target=pnl_target,
                  current_duration_ratio=current_duration_ratio,
                  next_pnl=next_pnl,
                  next_duration_ratio=next_duration_ratio,
@@ -2648,7 +2652,12 @@ def _get_potential_gamma(params: RewardParams) -> float:
  # === PBRS IMPLEMENTATION ===
  
  
-def _compute_hold_potential(pnl: float, duration_ratio: float, params: RewardParams) -> float:
+def _compute_hold_potential(
+    pnl: float,
+    pnl_target: float,
+    duration_ratio: float,
+    params: RewardParams,
+) -> float:
      """Compute PBRS hold potential Φ(s)."""
      if not _get_bool_param(
          params,
@@ -2659,6 +2668,7 @@ def _compute_hold_potential(pnl: float, duration_ratio: float, params: RewardPar
      return _compute_bi_component(
          kind="hold_potential",
          pnl=pnl,
+        pnl_target=pnl_target,
          duration_ratio=duration_ratio,
          params=params,
          scale_key="hold_potential_scale",
@@ -2669,7 +2679,12 @@ def _compute_hold_potential(pnl: float, duration_ratio: float, params: RewardPar
      )
  
  
-def _compute_entry_additive(pnl: float, duration_ratio: float, params: RewardParams) -> float:
+def _compute_entry_additive(
+    pnl: float,
+    pnl_target: float,
+    duration_ratio: float,
+    params: RewardParams,
+) -> float:
      if not _get_bool_param(
          params,
          "entry_additive_enabled",
@@ -2679,6 +2694,7 @@ def _compute_entry_additive(pnl: float, duration_ratio: float, params: RewardPar
      return _compute_bi_component(
          kind="entry_additive",
          pnl=pnl,
+        pnl_target=pnl_target,
          duration_ratio=duration_ratio,
          params=params,
          scale_key="entry_additive_scale",
@@ -2689,7 +2705,12 @@ def _compute_entry_additive(pnl: float, duration_ratio: float, params: RewardPar
      )
  
  
-def _compute_exit_additive(pnl: float, duration_ratio: float, params: RewardParams) -> float:
+def _compute_exit_additive(
+    pnl: float,
+    pnl_target: float,
+    duration_ratio: float,
+    params: RewardParams,
+) -> float:
      if not _get_bool_param(
          params,
          "exit_additive_enabled",
@@ -2699,6 +2720,7 @@ def _compute_exit_additive(pnl: float, duration_ratio: float, params: RewardPara
      return _compute_bi_component(
          kind="exit_additive",
          pnl=pnl,
+        pnl_target=pnl_target,
          duration_ratio=duration_ratio,
          params=params,
          scale_key="exit_additive_scale",
@@ -2766,6 +2788,7 @@ def _compute_exit_potential(last_potential: float, params: RewardParams) -> floa
  def apply_potential_shaping(
      base_reward: float,
      current_pnl: float,
+    pnl_target: float,
      current_duration_ratio: float,
      next_pnl: float,
      next_duration_ratio: float,
@@ -2785,7 +2808,7 @@ def apply_potential_shaping(
  
      Notes
      -----
-    - Shaping Δ = γ·Φ(next) − Φ(prev) with prev = Φ(current_pnl, current_duration_ratio).
+    - Shaping Δ = γ·Φ(next) − Φ(prev).
      - previous_potential:
          Previously computed Φ(s) for the prior transition. When provided and finite, it
          is used as Φ(prev) in Δ; otherwise Φ(prev) is derived from the current state.
@@ -2805,7 +2828,7 @@ def apply_potential_shaping(
      prev_term = (
          float(previous_potential)
          if np.isfinite(previous_potential)
-        else _compute_hold_potential(current_pnl, current_duration_ratio, params)
+        else _compute_hold_potential(current_pnl, pnl_target, current_duration_ratio, params)
      )
      if not np.isfinite(prev_term):
          prev_term = 0.0
@@ -2818,16 +2841,15 @@ def apply_potential_shaping(
          )
          next_potential = _compute_exit_potential(last_potential, params)
      else:
-        next_potential = _compute_hold_potential(next_pnl, next_duration_ratio, params)
+        next_potential = _compute_hold_potential(next_pnl, pnl_target, next_duration_ratio, params)
  
      # PBRS shaping Δ = γ·Φ(next) − Φ(prev)
      pbrs_delta = gamma * next_potential - float(prev_term)
      reward_shaping = pbrs_delta
  
      # Non-PBRS additives
-    # Pre-compute candidate additives (return 0.0 if corresponding feature disabled)
-    cand_entry_add = _compute_entry_additive(next_pnl, next_duration_ratio, params)
-    cand_exit_add = _compute_exit_additive(current_pnl, current_duration_ratio, params)
+    cand_entry_add = _compute_entry_additive(next_pnl, pnl_target, next_duration_ratio, params)
+    cand_exit_add = _compute_exit_additive(current_pnl, pnl_target, current_duration_ratio, params)
  
      entry_additive = cand_entry_add if is_entry else 0.0
      exit_additive = cand_exit_add if is_exit else 0.0
@@ -2891,6 +2913,7 @@ def _enforce_pbrs_invariance(params: RewardParams) -> RewardParams:
  def _compute_bi_component(
      kind: str,
      pnl: float,
+    pnl_target: float,
      duration_ratio: float,
      params: RewardParams,
      scale_key: str,
@@ -2900,11 +2923,20 @@ def _compute_bi_component(
      non_finite_key: str,
  ) -> float:
      """Generic helper for (pnl, duration) bi-component transforms."""
+    if not (np.isfinite(pnl) and np.isfinite(pnl_target) and np.isfinite(duration_ratio)):
+        return _fail_safely(non_finite_key)
+    if pnl_target <= 0.0:
+        return _fail_safely(f"{kind}_invalid_pnl_target")
+
+    pnl_ratio = float(pnl / pnl_target)
+    duration_ratio = float(np.clip(duration_ratio, 0.0, 1.0))
+
      scale = _get_float_param(params, scale_key, 1.0)
      gain = _get_float_param(params, gain_key, 1.0)
      transform_pnl = _get_str_param(params, transform_pnl_key, "tanh")
      transform_duration = _get_str_param(params, transform_dur_key, "tanh")
-    t_pnl = apply_transform(transform_pnl, gain * pnl)
+
+    t_pnl = apply_transform(transform_pnl, gain * pnl_ratio)
      t_dur = apply_transform(transform_duration, gain * duration_ratio)
      value = scale * 0.5 * (t_pnl + t_dur)
      if not np.isfinite(value):
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_additives.py b/ReforceXY/reward_space_analysis/tests/components/test_additives.py

index e675f47916a1fb0a5702e87ed52e76336a21e5de..d1f3857f596eb9fc17d483088909541fb986635f 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/components/test_additives.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_additives.py
@@ -39,6 +39,7 @@ class TestAdditivesDeterministicContribution(RewardSpaceTestBase):
          ctx = {
              "base_reward": 0.05,
              "current_pnl": 0.01,
+            "pnl_target": self.TEST_PROFIT_TARGET,
              "current_duration_ratio": 0.2,
              "next_pnl": 0.012,
              "next_duration_ratio": 0.25,
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py

index 9bd41b97cf4ba3e4d3210cfe787924eb43aa5c90..ed0511aedceba042afd5009087819fc71114e761 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
+++ b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
@@ -43,7 +43,7 @@ class TestRewardComponents(RewardSpaceTestBase):
              "hold_potential_transform_pnl": "tanh",
              "hold_potential_transform_duration": "tanh",
          }
-        val = _compute_hold_potential(0.5, 0.3, params)
+        val = _compute_hold_potential(0.5, self.TEST_PROFIT_TARGET, 0.3, params)
          self.assertFinite(val, name="hold_potential")
  
      def test_hold_penalty_basic_calculation(self):
diff --git a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py

index cf41459c795c6134c2d6152199815718b078e213..641ee74c40627bc4d5244add9cbefd63eab7be5e 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
+++ b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
@@ -56,7 +56,8 @@ class TestPBRS(RewardSpaceTestBase):
          )
          current_pnl = 0.02
          current_dur = 0.5
-        prev_potential = _compute_hold_potential(current_pnl, current_dur, params)
+        pnl_target = self.TEST_PROFIT_TARGET
+        prev_potential = _compute_hold_potential(current_pnl, pnl_target, current_dur, params)
          (
              _total_reward,
              reward_shaping,
@@ -67,6 +68,7 @@ class TestPBRS(RewardSpaceTestBase):
          ) = apply_potential_shaping(
              base_reward=0.0,
              current_pnl=current_pnl,
+            pnl_target=pnl_target,
              current_duration_ratio=current_dur,
              next_pnl=0.0,
              next_duration_ratio=0.0,
@@ -94,7 +96,8 @@ class TestPBRS(RewardSpaceTestBase):
          )
          current_pnl = 0.015
          current_dur = 0.4
-        prev_potential = _compute_hold_potential(current_pnl, current_dur, params)
+        pnl_target = self.TEST_PROFIT_TARGET
+        prev_potential = _compute_hold_potential(current_pnl, pnl_target, current_dur, params)
          gamma = _get_float_param(
              params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95)
          )
@@ -111,6 +114,7 @@ class TestPBRS(RewardSpaceTestBase):
          ) = apply_potential_shaping(
              base_reward=0.0,
              current_pnl=current_pnl,
+            pnl_target=pnl_target,
              current_duration_ratio=current_dur,
              next_pnl=0.0,
              next_duration_ratio=0.0,
@@ -186,10 +190,10 @@ class TestPBRS(RewardSpaceTestBase):
      def test_additive_components_disabled_return_zero(self):
          """Verifies entry/exit additives return zero when disabled."""
          params_entry = {"entry_additive_enabled": False, "entry_additive_scale": 1.0}
-        val_entry = _compute_entry_additive(0.5, 0.3, params_entry)
+        val_entry = _compute_entry_additive(0.5, self.TEST_PROFIT_TARGET, 0.3, params_entry)
          self.assertEqual(float(val_entry), 0.0)
          params_exit = {"exit_additive_enabled": False, "exit_additive_scale": 1.0}
-        val_exit = _compute_exit_additive(0.5, 0.3, params_exit)
+        val_exit = _compute_exit_additive(0.5, self.TEST_PROFIT_TARGET, 0.3, params_exit)
          self.assertEqual(float(val_exit), 0.0)
  
      def test_exit_potential_canonical(self):
@@ -209,6 +213,7 @@ class TestPBRS(RewardSpaceTestBase):
              apply_potential_shaping(
                  base_reward=base_reward,
                  current_pnl=current_pnl,
+                pnl_target=self.TEST_PROFIT_TARGET,
                  current_duration_ratio=current_duration_ratio,
                  next_pnl=next_pnl,
                  next_duration_ratio=next_duration_ratio,
@@ -230,6 +235,7 @@ class TestPBRS(RewardSpaceTestBase):
          self.assertPlacesEqual(next_potential, 0.0, places=12)
          current_potential = _compute_hold_potential(
              current_pnl,
+            self.TEST_PROFIT_TARGET,
              current_duration_ratio,
              {"hold_potential_enabled": True, "hold_potential_scale": 1.0},
          )
@@ -250,6 +256,7 @@ class TestPBRS(RewardSpaceTestBase):
          _t1, _s1, _n1, _pbrs_delta, _entry_additive, _exit_additive = apply_potential_shaping(
              base_reward=0.0,
              current_pnl=0.05,
+            pnl_target=self.TEST_PROFIT_TARGET,
              current_duration_ratio=0.3,
              next_pnl=0.0,
              next_duration_ratio=0.0,
@@ -271,6 +278,7 @@ class TestPBRS(RewardSpaceTestBase):
          _t2, _s2, _n2, _pbrs_delta2, _entry_additive2, _exit_additive2 = apply_potential_shaping(
              base_reward=0.0,
              current_pnl=0.02,
+            pnl_target=self.TEST_PROFIT_TARGET,
              current_duration_ratio=0.1,
              next_pnl=0.0,
              next_duration_ratio=0.0,
@@ -295,6 +303,7 @@ class TestPBRS(RewardSpaceTestBase):
              apply_potential_shaping(
                  base_reward=0.0,
                  current_pnl=0.0,
+                pnl_target=self.TEST_PROFIT_TARGET,
                  current_duration_ratio=0.0,
                  next_pnl=0.0,
                  next_duration_ratio=0.0,
@@ -321,6 +330,7 @@ class TestPBRS(RewardSpaceTestBase):
          res_nan = apply_potential_shaping(
              base_reward=0.1,
              current_pnl=0.03,
+            pnl_target=self.TEST_PROFIT_TARGET,
              current_duration_ratio=0.2,
              next_pnl=0.035,
              next_duration_ratio=0.25,
@@ -332,6 +342,7 @@ class TestPBRS(RewardSpaceTestBase):
          res_ref = apply_potential_shaping(
              base_reward=0.1,
              current_pnl=0.03,
+            pnl_target=self.TEST_PROFIT_TARGET,
              current_duration_ratio=0.2,
              next_pnl=0.035,
              next_duration_ratio=0.25,
@@ -413,7 +424,9 @@ class TestPBRS(RewardSpaceTestBase):
          ctx_pnl = 0.012
          ctx_dur_ratio = 0.3
          params_can = self.base_params(exit_potential_mode="canonical", **base_common)
-        prev_phi = _compute_hold_potential(ctx_pnl, ctx_dur_ratio, params_can)
+        prev_phi = _compute_hold_potential(
+            ctx_pnl, self.TEST_PROFIT_TARGET, ctx_dur_ratio, params_can
+        )
          self.assertFinite(prev_phi, name="prev_phi")
          next_phi_can = _compute_exit_potential(prev_phi, params_can)
          self.assertAlmostEqualFloat(
@@ -669,6 +682,7 @@ class TestPBRS(RewardSpaceTestBase):
                  apply_potential_shaping(
                      base_reward=0.0,
                      current_pnl=0.02,
+                    pnl_target=self.TEST_PROFIT_TARGET,
                      current_duration_ratio=0.3,
                      next_pnl=0.025,
                      next_duration_ratio=0.35,
@@ -709,6 +723,7 @@ class TestPBRS(RewardSpaceTestBase):
                  apply_potential_shaping(
                      base_reward=0.0,
                      current_pnl=current_pnl,
+                    pnl_target=self.TEST_PROFIT_TARGET,
                      current_duration_ratio=current_dur,
                      next_pnl=next_pnl,
                      next_duration_ratio=next_dur,
@@ -762,6 +777,7 @@ class TestPBRS(RewardSpaceTestBase):
                  apply_potential_shaping(
                      base_reward=0.0,
                      current_pnl=float(rng.normal(0, 0.07)),
+                    pnl_target=self.TEST_PROFIT_TARGET,
                      current_duration_ratio=float(rng.uniform(0, 1)),
                      next_pnl=next_pnl,
                      next_duration_ratio=next_dur,
@@ -1060,11 +1076,11 @@ class TestPBRS(RewardSpaceTestBase):
  
              comp_share = _pd.Series([], dtype=float)
              action_summary = _pd.DataFrame(
-                columns=["count", "mean", "std", "min", "max"],
+                columns=_pd.Index(["count", "mean", "std", "min", "max"]),
                  index=_pd.Index([], name="action"),
              )
              component_bounds = _pd.DataFrame(
-                columns=["component_min", "component_mean", "component_max"],
+                columns=_pd.Index(["component_min", "component_mean", "component_max"]),
                  index=_pd.Index([], name="component"),
              )
              global_stats = _pd.Series([], dtype=float)
diff --git a/ReforceXY/reward_space_analysis/tests/robustness/test_robustness.py b/ReforceXY/reward_space_analysis/tests/robustness/test_robustness.py

index cbe4fee14f36fe5cd76339d5cc4794ef98e2faab..e6176a3a2aea10741776ab43f9fbb1db657a9085 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/robustness/test_robustness.py
+++ b/ReforceXY/reward_space_analysis/tests/robustness/test_robustness.py
@@ -246,7 +246,7 @@ class TestRewardRobustnessAndBoundaries(RewardSpaceTestBase):
          )
  
      def test_negative_slope_sanitization(self):
-        """Negative exit_linear_slope is sanitized to 0.0; resulting exit factors must match slope=0.0 within tolerance."""
+        """Negative exit_linear_slope is sanitized to 1.0; resulting exit factors must match slope=1.0 within tolerance."""
          base_factor = 100.0
          pnl = 0.03
          pnl_factor = 1.0
@@ -255,7 +255,7 @@ class TestRewardRobustnessAndBoundaries(RewardSpaceTestBase):
              exit_attenuation_mode="linear", exit_linear_slope=-5.0, exit_plateau=False
          )
          params_ref = self.base_params(
-            exit_attenuation_mode="linear", exit_linear_slope=0.0, exit_plateau=False
+            exit_attenuation_mode="linear", exit_linear_slope=1.0, exit_plateau=False
          )
          for dr in duration_ratios:
              f_bad = _get_exit_factor(base_factor, pnl, pnl_factor, dr, params_bad)
diff --git a/ReforceXY/reward_space_analysis/tests/test_base.py b/ReforceXY/reward_space_analysis/tests/test_base.py

index 04beff2741697887cab625dda8746f211c23be16..d65c0cdb3636980238873a53a2257894de71c146 100644 (file)
--- a/ReforceXY/reward_space_analysis/tests/test_base.py
+++ b/ReforceXY/reward_space_analysis/tests/test_base.py
@@ -163,6 +163,7 @@ class RewardSpaceTestBase(unittest.TestCase):
                  apply_potential_shaping(
                      base_reward=0.0,
                      current_pnl=current_pnl,
+                    pnl_target=self.TEST_PROFIT_TARGET,
                      current_duration_ratio=current_dur,
                      next_pnl=next_pnl,
                      next_duration_ratio=next_dur,
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py

index 8a73ac9a38c4a2aa995a44c44cb55b07fa48d4f6..43ae8f77a7baf585c2729212b3a02644a931ebd0 100644 (file)
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -149,11 +149,40 @@ class ReforceXY(BaseReinforcementLearningModel):
  
      _LOG_2: Final[float] = math.log(2.0)
  
+    DEFAULT_MAX_TRADE_DURATION_CANDLES: Final[int] = 128
      DEFAULT_IDLE_DURATION_MULTIPLIER: Final[int] = 4
+
      DEFAULT_BASE_FACTOR: Final[float] = 100.0
-    DEFAULT_HOLD_POTENTIAL_SCALE: Final[float] = 1.0
      DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
-    DEFAULT_MAX_TRADE_DURATION_CANDLES: Final[int] = 128
+
+    DEFAULT_EXIT_POTENTIAL_DECAY: Final[float] = 0.5
+    DEFAULT_ENTRY_ADDITIVE_ENABLED: Final[bool] = False
+    DEFAULT_ENTRY_ADDITIVE_SCALE: Final[float] = 1.0
+    DEFAULT_ENTRY_ADDITIVE_GAIN: Final[float] = 1.0
+    DEFAULT_HOLD_POTENTIAL_ENABLED: Final[bool] = True
+    DEFAULT_HOLD_POTENTIAL_SCALE: Final[float] = 1.0
+    DEFAULT_HOLD_POTENTIAL_GAIN: Final[float] = 1.0
+    DEFAULT_EXIT_ADDITIVE_ENABLED: Final[bool] = False
+    DEFAULT_EXIT_ADDITIVE_SCALE: Final[float] = 1.0
+    DEFAULT_EXIT_ADDITIVE_GAIN: Final[float] = 1.0
+
+    DEFAULT_EXIT_PLATEAU: Final[bool] = True
+    DEFAULT_EXIT_PLATEAU_GRACE: Final[float] = 1.0
+    DEFAULT_EXIT_LINEAR_SLOPE: Final[float] = 1.0
+    DEFAULT_EXIT_HALF_LIFE: Final[float] = 0.5
+
+    DEFAULT_PNL_FACTOR_BETA: Final[float] = 0.5
+    DEFAULT_WIN_REWARD_FACTOR: Final[float] = 2.0
+    DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
+
+    DEFAULT_INVALID_ACTION: Final[float] = -2.0
+    DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 0.5
+    DEFAULT_IDLE_PENALTY_POWER: Final[float] = 1.025
+    DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 0.25
+    DEFAULT_HOLD_PENALTY_POWER: Final[float] = 1.025
+
+    DEFAULT_CHECK_INVARIANTS: Final[bool] = True
+    DEFAULT_EXIT_FACTOR_THRESHOLD: Final[float] = 10_000.0
  
      _MODEL_TYPES: Final[Tuple[ModelType, ...]] = (
          "PPO",
@@ -1610,17 +1639,25 @@ class MyRLEnv(Base5ActionRLEnv):
                  0
              ]  # "canonical"
          self._exit_potential_decay: float = float(
-            model_reward_parameters.get("exit_potential_decay", 0.5)
+            model_reward_parameters.get(
+                "exit_potential_decay", ReforceXY.DEFAULT_EXIT_POTENTIAL_DECAY
+            )
          )
          # === ENTRY ADDITIVE (non-PBRS additive term) ===
          self._entry_additive_enabled: bool = bool(
-            model_reward_parameters.get("entry_additive_enabled", False)
+            model_reward_parameters.get(
+                "entry_additive_enabled", ReforceXY.DEFAULT_ENTRY_ADDITIVE_ENABLED
+            )
          )
          self._entry_additive_scale: float = float(
-            model_reward_parameters.get("entry_additive_scale", 1.0)
+            model_reward_parameters.get(
+                "entry_additive_scale", ReforceXY.DEFAULT_ENTRY_ADDITIVE_SCALE
+            )
          )
          self._entry_additive_gain: float = float(
-            model_reward_parameters.get("entry_additive_gain", 1.0)
+            model_reward_parameters.get(
+                "entry_additive_gain", ReforceXY.DEFAULT_ENTRY_ADDITIVE_GAIN
+            )
          )
          self._entry_additive_transform_pnl: TransformFunction = cast(
              TransformFunction,
@@ -1636,7 +1673,9 @@ class MyRLEnv(Base5ActionRLEnv):
          )
          # === HOLD POTENTIAL (PBRS function Φ) ===
          self._hold_potential_enabled: bool = bool(
-            model_reward_parameters.get("hold_potential_enabled", True)
+            model_reward_parameters.get(
+                "hold_potential_enabled", ReforceXY.DEFAULT_HOLD_POTENTIAL_ENABLED
+            )
          )
          self._hold_potential_scale: float = float(
              model_reward_parameters.get(
@@ -1644,7 +1683,9 @@ class MyRLEnv(Base5ActionRLEnv):
              )
          )
          self._hold_potential_gain: float = float(
-            model_reward_parameters.get("hold_potential_gain", 1.0)
+            model_reward_parameters.get(
+                "hold_potential_gain", ReforceXY.DEFAULT_HOLD_POTENTIAL_GAIN
+            )
          )
          self._hold_potential_transform_pnl: TransformFunction = cast(
              TransformFunction,
@@ -1660,13 +1701,19 @@ class MyRLEnv(Base5ActionRLEnv):
          )
          # === EXIT ADDITIVE (non-PBRS additive term) ===
          self._exit_additive_enabled: bool = bool(
-            model_reward_parameters.get("exit_additive_enabled", False)
+            model_reward_parameters.get(
+                "exit_additive_enabled", ReforceXY.DEFAULT_EXIT_ADDITIVE_ENABLED
+            )
          )
          self._exit_additive_scale: float = float(
-            model_reward_parameters.get("exit_additive_scale", 1.0)
+            model_reward_parameters.get(
+                "exit_additive_scale", ReforceXY.DEFAULT_EXIT_ADDITIVE_SCALE
+            )
          )
          self._exit_additive_gain: float = float(
-            model_reward_parameters.get("exit_additive_gain", 1.0)
+            model_reward_parameters.get(
+                "exit_additive_gain", ReforceXY.DEFAULT_EXIT_ADDITIVE_GAIN
+            )
          )
          self._exit_additive_transform_pnl: TransformFunction = cast(
              TransformFunction,
@@ -2335,9 +2382,13 @@ class MyRLEnv(Base5ActionRLEnv):
                  "exit_attenuation_mode", ReforceXY._EXIT_ATTENUATION_MODES[2]
              )  # "linear"
          )
-        exit_plateau = bool(model_reward_parameters.get("exit_plateau", True))
+        exit_plateau = bool(
+            model_reward_parameters.get("exit_plateau", ReforceXY.DEFAULT_EXIT_PLATEAU)
+        )
          exit_plateau_grace = float(
-            model_reward_parameters.get("exit_plateau_grace", 1.0)
+            model_reward_parameters.get(
+                "exit_plateau_grace", ReforceXY.DEFAULT_EXIT_PLATEAU_GRACE
+            )
          )
          if exit_plateau_grace < 0.0:
              exit_plateau_grace = 0.0
@@ -2349,7 +2400,9 @@ class MyRLEnv(Base5ActionRLEnv):
              return f / math.sqrt(1.0 + dr)
  
          def _linear(f: float, dr: float, p: Mapping[str, Any]) -> float:
-            slope = float(p.get("exit_linear_slope", 1.0))
+            slope = float(
+                p.get("exit_linear_slope", ReforceXY.DEFAULT_EXIT_LINEAR_SLOPE)
+            )
              if slope < 0.0:
                  slope = 1.0
              return f / (1.0 + slope * dr)
@@ -2367,7 +2420,7 @@ class MyRLEnv(Base5ActionRLEnv):
              return f / math.pow(1.0 + dr, alpha)
  
          def _half_life(f: float, dr: float, p: Mapping[str, Any]) -> float:
-            hl = float(p.get("exit_half_life", 0.5))
+            hl = float(p.get("exit_half_life", ReforceXY.DEFAULT_EXIT_HALF_LIFE))
              if np.isclose(hl, 0.0) or hl < 0.0:
                  return 1.0
              return f * math.pow(2.0, -dr / hl)
@@ -2436,7 +2489,9 @@ class MyRLEnv(Base5ActionRLEnv):
              pnl, self._pnl_target, model_reward_parameters
          )
  
-        check_invariants = model_reward_parameters.get("check_invariants", True)
+        check_invariants = model_reward_parameters.get(
+            "check_invariants", ReforceXY.DEFAULT_CHECK_INVARIANTS
+        )
          check_invariants = (
              check_invariants if isinstance(check_invariants, bool) else True
          )
@@ -2454,7 +2509,9 @@ class MyRLEnv(Base5ActionRLEnv):
                  )
                  factor = 0.0
              exit_factor_threshold = float(
-                model_reward_parameters.get("exit_factor_threshold", 10_000.0)
+                model_reward_parameters.get(
+                    "exit_factor_threshold", ReforceXY.DEFAULT_EXIT_FACTOR_THRESHOLD
+                )
              )
              if exit_factor_threshold > 0 and abs(factor) > exit_factor_threshold:
                  logger.warning(
@@ -2474,7 +2531,11 @@ class MyRLEnv(Base5ActionRLEnv):
          pnl_target_factor = 1.0
  
          if pnl_target > 0.0:
-            pnl_factor_beta = float(model_reward_parameters.get("pnl_factor_beta", 0.5))
+            pnl_factor_beta = float(
+                model_reward_parameters.get(
+                    "pnl_factor_beta", ReforceXY.DEFAULT_PNL_FACTOR_BETA
+                )
+            )
              pnl_ratio = pnl / pnl_target
  
              if abs(pnl_ratio) > 1.0:
@@ -2482,7 +2543,9 @@ class MyRLEnv(Base5ActionRLEnv):
                      pnl_factor_beta * (abs(pnl_ratio) - 1.0)
                  )
                  win_reward_factor = float(
-                    model_reward_parameters.get("win_reward_factor", 2.0)
+                    model_reward_parameters.get(
+                        "win_reward_factor", ReforceXY.DEFAULT_WIN_REWARD_FACTOR
+                    )
                  )
  
                  if pnl_ratio > 1.0:
@@ -2506,7 +2569,11 @@ class MyRLEnv(Base5ActionRLEnv):
                  "efficiency_weight", ReforceXY.DEFAULT_EFFICIENCY_WEIGHT
              )
          )
-        efficiency_center = float(model_reward_parameters.get("efficiency_center", 0.5))
+        efficiency_center = float(
+            model_reward_parameters.get(
+                "efficiency_center", ReforceXY.DEFAULT_EFFICIENCY_CENTER
+            )
+        )
  
          efficiency_factor = 1.0
          if efficiency_weight != 0.0 and not np.isclose(pnl, 0.0):
@@ -2532,9 +2599,6 @@ class MyRLEnv(Base5ActionRLEnv):
          """
          Combine PnL target and efficiency factors (>= 0.0)
          """
-        if not np.isfinite(pnl):
-            return 0.0
-
          pnl_target_factor = self._compute_pnl_target_factor(
              pnl, pnl_target, model_reward_parameters
          )
@@ -2580,13 +2644,19 @@ class MyRLEnv(Base5ActionRLEnv):
          # 1. Invalid action
          if not self.action_masking and not self._is_valid(action):
              self.tensorboard_log("invalid", category="actions")
-            base_reward = float(model_reward_parameters.get("invalid_action", -2.0))
+            base_reward = float(
+                model_reward_parameters.get(
+                    "invalid_action", ReforceXY.DEFAULT_INVALID_ACTION
+                )
+            )
              self._last_invalid_penalty = float(base_reward)
  
          max_trade_duration = max(1, self.max_trade_duration_candles)
          trade_duration = self.get_trade_duration()
          duration_ratio = trade_duration / max_trade_duration
-        base_factor = float(model_reward_parameters.get("base_factor", 100.0))
+        base_factor = float(
+            model_reward_parameters.get("base_factor", ReforceXY.DEFAULT_BASE_FACTOR)
+        )
          idle_factor = base_factor * self._pnl_target / 4.0
          hold_factor = idle_factor
  
@@ -2598,10 +2668,14 @@ class MyRLEnv(Base5ActionRLEnv):
          ):
              max_idle_duration = max(1, self.max_idle_duration_candles)
              idle_penalty_scale = float(
-                model_reward_parameters.get("idle_penalty_scale", 0.5)
+                model_reward_parameters.get(
+                    "idle_penalty_scale", ReforceXY.DEFAULT_IDLE_PENALTY_SCALE
+                )
              )
              idle_penalty_power = float(
-                model_reward_parameters.get("idle_penalty_power", 1.025)
+                model_reward_parameters.get(
+                    "idle_penalty_power", ReforceXY.DEFAULT_IDLE_PENALTY_POWER
+                )
              )
              idle_duration = self.get_idle_duration()
              idle_duration_ratio = idle_duration / max(1, max_idle_duration)
@@ -2619,10 +2693,14 @@ class MyRLEnv(Base5ActionRLEnv):
              and action == Actions.Neutral.value
          ):
              hold_penalty_scale = float(
-                model_reward_parameters.get("hold_penalty_scale", 0.25)
+                model_reward_parameters.get(
+                    "hold_penalty_scale", ReforceXY.DEFAULT_HOLD_PENALTY_SCALE
+                )
              )
              hold_penalty_power = float(
-                model_reward_parameters.get("hold_penalty_power", 1.025)
+                model_reward_parameters.get(
+                    "hold_penalty_power", ReforceXY.DEFAULT_HOLD_PENALTY_POWER
+                )
              )
              if duration_ratio < 1.0:
                  base_reward = 0.0
author	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Tue, 16 Dec 2025 10:13:32 +0000 (11:13 +0100)
committer	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Tue, 16 Dec 2025 10:13:32 +0000 (11:13 +0100)
ReforceXY/reward_space_analysis/reward_space_analysis.py		patch | blob | blame | history
ReforceXY/reward_space_analysis/tests/components/test_additives.py		patch | blob | blame | history
ReforceXY/reward_space_analysis/tests/components/test_reward_components.py		patch | blob | blame | history
ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py		patch | blob | blame | history
ReforceXY/reward_space_analysis/tests/robustness/test_robustness.py		patch | blob | blame | history
ReforceXY/reward_space_analysis/tests/test_base.py		patch | blob | blame | history
ReforceXY/user_data/freqaimodels/ReforceXY.py		patch | blob | blame | history