Piment Noir Git Repositories - freqai-strategies.git/commitdiff
refactor(reforcexy): cleanup RSA implementation
author    Jérôme Benoit <jerome.benoit@piment-noir.org>
          Tue, 21 Oct 2025 11:19:26 +0000 (13:19 +0200)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
          Tue, 21 Oct 2025 11:19:26 +0000 (13:19 +0200)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/reward_space_analysis.py

index d8dcab7bf28f8ac9fa5a8d3e576588e782cd75cc..4f18b9e2dbac8e030b21a2d299760173b92041f3 100644 (file)
@@ -182,7 +182,7 @@ Core frequently tuned parameters:
 | `pnl_factor_beta` | 0.5 | PnL amplification beta |
 | `idle_penalty_scale` | 0.5 | Idle penalty scale |
 | `idle_penalty_power` | 1.025 | Idle penalty exponent |
-| `max_trade_duration_candles` | 128 | Trade duration cap | 
+| `max_trade_duration_candles` | 128 | Trade duration cap |
 | `max_idle_duration_candles` | None | Idle duration cap; fallback 4× max trade duration |
 | `hold_penalty_scale` | 0.25 | Hold penalty scale |
 | `hold_penalty_power` | 1.025 | Hold penalty exponent |
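For orientation, a minimal sketch of how the tuned parameters from the table rows above might be grouped into a Python dict passed to the reward calculation; the key names and defaults mirror the table, but the surrounding `reward_params` structure is an assumption for illustration, not the repository's actual configuration layout.

    # Hypothetical grouping of the tuned parameters documented above.
    # Key names follow the table; values mirror the listed defaults.
    reward_params = {
        "pnl_factor_beta": 0.5,             # PnL amplification beta
        "idle_penalty_scale": 0.5,          # idle penalty scale
        "idle_penalty_power": 1.025,        # idle penalty exponent
        "max_trade_duration_candles": 128,  # trade duration cap
        "max_idle_duration_candles": None,  # falls back to 4x max trade duration
        "hold_penalty_scale": 0.25,         # hold penalty scale
        "hold_penalty_power": 1.025,        # hold penalty exponent
    }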
index 6ae5799a9948cb82a4569d411b4670a96341f735..576ed675f89c9611a2cbf52302f8073e803b77c1 100644 (file)
@@ -1082,13 +1082,15 @@ def calculate_reward(
     )
 
     if pbrs_enabled and not is_neutral:
-        # Derive Φ(prev) from current state to ensure telescoping semantics
-        prev_potential = _compute_hold_potential(current_pnl, current_duration_ratio, params)
-        if not np.isfinite(prev_potential):
-            prev_potential = 0.0
-        # Effective previous potential used for reporting: prefer provided previous_potential if finite
-        prev_potential = (
-            float(previous_potential) if np.isfinite(previous_potential) else float(prev_potential)
+        # Compute Φ(s) for the current state to preserve telescoping semantics Δ = γ·Φ(s') − Φ(s)
+        current_potential = _compute_hold_potential(current_pnl, current_duration_ratio, params)
+        if not np.isfinite(current_potential):
+            current_potential = 0.0
+
+        last_potential = (
+            float(previous_potential)
+            if np.isfinite(previous_potential)
+            else float(current_potential)
         )
 
         total_reward, reward_shaping, next_potential = apply_potential_shaping(
@@ -1099,12 +1101,13 @@ def calculate_reward(
             next_duration_ratio=next_duration_ratio,
             is_exit=is_exit,
             is_entry=is_entry,
-            previous_potential=previous_potential,
+            previous_potential=current_potential,
+            last_potential=last_potential,
             params=params,
         )
 
         breakdown.reward_shaping = reward_shaping
-        breakdown.prev_potential = prev_potential
+        breakdown.prev_potential = current_potential
         breakdown.next_potential = next_potential
         breakdown.entry_additive = (
             _compute_entry_additive(next_pnl, next_duration_ratio, params) if is_entry else 0.0
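The comment added in this hunk refers to potential-based reward shaping, where the shaping term is Δ = γ·Φ(s') − Φ(s) and the terms telescope over an episode, leaving the optimal policy unchanged. A minimal self-contained sketch of that telescoping property follows; the function and variable names are illustrative, not the module's actual API.

    def shaping_term(gamma: float, phi_next: float, phi_curr: float) -> float:
        # Potential-based shaping: delta = gamma * Phi(s') - Phi(s)
        return gamma * phi_next - phi_curr

    # In the discounted return, the shaping terms telescope: only the first
    # and last potentials survive, sum_t gamma^t * delta_t = gamma^T * Phi_T - Phi_0.
    gamma = 0.95
    potentials = [0.0, 0.4, 0.7, 0.2]  # Phi(s_0), ..., Phi(s_T), illustrative values
    T = len(potentials) - 1
    discounted = sum(
        gamma**t * shaping_term(gamma, potentials[t + 1], potentials[t])
        for t in range(T)
    )
    assert abs(discounted - (gamma**T * potentials[T] - potentials[0])) < 1e-12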
@@ -2095,9 +2098,11 @@ def statistical_hypothesis_tests(
         }
 
     # Optional multiple testing correction (Benjamini-Hochberg)
-    if adjust_method not in {"none", "benjamini_hochberg"}:
-        raise ValueError("Unsupported adjust_method. Use 'none' or 'benjamini_hochberg'.")
-    if adjust_method == "benjamini_hochberg" and results:
+    if adjust_method not in {"none", "benjamini_hochberg", "benjaminihochberg"}:
+        raise ValueError(
+            "Unsupported adjust_method. Use 'none', 'benjamini_hochberg', or 'benjaminihochberg'."
+        )
+    if adjust_method in {"benjamini_hochberg", "benjaminihochberg"} and results:
         # Collect p-values
         items = list(results.items())
         pvals = np.array([v[1]["p_value"] for v in items])
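The alias strings accepted above gate the Benjamini-Hochberg correction applied to the collected p-values. For reference, a minimal sketch of the BH adjustment over a p-value vector; the helper name and return shape are illustrative, not the script's actual implementation.

    import numpy as np

    def benjamini_hochberg(pvals: np.ndarray) -> np.ndarray:
        # Adjusted p-values: p_(i) * m / i on the sorted values, then a
        # cumulative minimum from the largest rank downward for monotonicity.
        m = len(pvals)
        order = np.argsort(pvals)
        ranked = pvals[order] * m / np.arange(1, m + 1)
        adjusted = np.minimum.accumulate(ranked[::-1])[::-1]
        adjusted = np.clip(adjusted, 0.0, 1.0)
        out = np.empty(m)
        out[order] = adjusted
        return out

    # Example: adjusted values are then compared against the chosen alpha.
    print(benjamini_hochberg(np.array([0.01, 0.04, 0.03, 0.20])))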