From d65e51128fa5d2dfcba64b8fdcc63d98fc2bfe72 Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Wed, 15 Oct 2025 23:23:42 +0200 Subject: [PATCH] refactor(reforcexy): align PBRS params namespace MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Jérôme Benoit --- ReforceXY/reward_space_analysis/README.md | 6 +++--- .../reward_space_analysis/reward_space_analysis.py | 10 +++++----- ReforceXY/reward_space_analysis/test_cli.py | 2 +- .../test_reward_space_analysis.py | 4 ++-- ReforceXY/user_data/freqaimodels/ReforceXY.py | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md index 6ddaa4b..68e87e7 100644 --- a/ReforceXY/reward_space_analysis/README.md +++ b/ReforceXY/reward_space_analysis/README.md @@ -6,7 +6,7 @@ Deterministic synthetic sampling with diagnostics for reward shaping, penalties, - Scalable synthetic scenario generation (reproducible) - Reward component decomposition & bounds checks -- PBRS modes: canonical, non-canonical, progressive_release, spike_cancel, retain_previous +- PBRS modes: canonical, non_canonical, progressive_release, spike_cancel, retain_previous - Feature importance & optional partial dependence - Statistical tests (hypothesis, bootstrap CIs, distribution diagnostics) - Real vs synthetic shift metrics @@ -193,7 +193,7 @@ Core frequently tuned parameters: | `exit_linear_slope` | 1.0 | Linear kernel slope | | `exit_power_tau` | 0.5 | Tau controlling power kernel decay (0,1] | | `exit_half_life` | 0.5 | Half-life for half_life kernel | -| `potential_gamma` | 0.9 | PBRS discount γ | +| `potential_gamma` | 0.95 | PBRS discount γ | | `exit_potential_mode` | canonical | Exit potential mode | | `efficiency_weight` | 1.0 | Efficiency contribution weight | | `efficiency_center` | 0.5 | Efficiency pivot in [0,1] | @@ -391,7 +391,7 @@ python reward_space_analysis.py \ # Non-canonical PBRS (allows additives with Φ(terminal)=0, breaks invariance) python reward_space_analysis.py \ --num_samples 25000 \ - --params hold_potential_enabled=true entry_additive_enabled=true exit_additive_enabled=true exit_potential_mode=non-canonical \ + --params hold_potential_enabled=true entry_additive_enabled=true exit_additive_enabled=true exit_potential_mode=non_canonical \ --out_dir pbrs_non_canonical python reward_space_analysis.py \ diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py index b51cd50..e878df7 100644 --- a/ReforceXY/reward_space_analysis/reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py @@ -78,7 +78,7 @@ ALLOWED_TRANSFORMS = { } ALLOWED_EXIT_POTENTIAL_MODES = { "canonical", - "non-canonical", + "non_canonical", "progressive_release", "spike_cancel", "retain_previous", @@ -115,7 +115,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = { # Potential-based reward shaping core parameters # Discount factor γ for potential term (0 ≤ γ ≤ 1) "potential_gamma": POTENTIAL_GAMMA_DEFAULT, - # Exit potential modes: canonical | non-canonical | progressive_release | spike_cancel | retain_previous + # Exit potential modes: canonical | non_canonical | progressive_release | spike_cancel | retain_previous "exit_potential_mode": "canonical", "exit_potential_decay": 0.5, # Hold potential (PBRS function Φ) @@ -160,7 +160,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = { "exit_factor_threshold": "Warn if |exit_factor| exceeds", # PBRS parameters "potential_gamma": "PBRS discount γ (0–1)", - "exit_potential_mode": "Exit potential mode (canonical|non-canonical|progressive_release|spike_cancel|retain_previous)", + "exit_potential_mode": "Exit potential mode (canonical|non_canonical|progressive_release|spike_cancel|retain_previous)", "exit_potential_decay": "Decay for progressive_release (0–1)", "hold_potential_enabled": "Enable hold potential Φ", "hold_potential_scale": "Hold potential scale", @@ -2381,13 +2381,13 @@ def _compute_exit_additive( def _compute_exit_potential(last_potential: float, params: RewardParams) -> float: - """Exit potential per mode (canonical/non-canonical -> 0; others transform Φ).""" + """Exit potential per mode (canonical/non_canonical -> 0; others transform Φ).""" mode = _get_str_param( params, "exit_potential_mode", str(DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_potential_mode", "canonical")), ) - if mode == "canonical" or mode == "non-canonical": + if mode == "canonical" or mode == "non_canonical": return _fail_safely("canonical_exit_potential") if mode == "progressive_release": diff --git a/ReforceXY/reward_space_analysis/test_cli.py b/ReforceXY/reward_space_analysis/test_cli.py index da0c37d..f7b40b4 100644 --- a/ReforceXY/reward_space_analysis/test_cli.py +++ b/ReforceXY/reward_space_analysis/test_cli.py @@ -79,7 +79,7 @@ def build_arg_matrix( ) -> List[ConfigTuple]: exit_potential_modes = [ "canonical", - "non-canonical", + "non_canonical", "progressive_release", "retain_previous", "spike_cancel", diff --git a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py index 400a79f..c1f99b8 100644 --- a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py @@ -3398,13 +3398,13 @@ class TestReportFormatting(RewardSpaceTestBase): def test_additive_activation_deterministic_contribution(self): """Additives enabled increase total reward; shaping impact limited.""" - # Use a non-canonical exit mode to avoid automatic invariance enforcement + # Use a non_canonical exit mode to avoid automatic invariance enforcement # disabling the additive components on first call (canonical path auto-disables). base = self.base_params( hold_potential_enabled=True, entry_additive_enabled=False, exit_additive_enabled=False, - exit_potential_mode="non-canonical", + exit_potential_mode="non_canonical", ) with_add = base.copy() with_add.update( diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py index da2c66b..08b4ac4 100644 --- a/ReforceXY/user_data/freqaimodels/ReforceXY.py +++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py @@ -1363,7 +1363,7 @@ class MyRLEnv(Base5ActionRLEnv): # === EXIT POTENTIAL MODE === # exit_potential_mode options: # 'canonical' -> Φ(s')=0 (preserves invariance, disables additives) - # 'non-canonical' -> Φ(s')=0 (allows additives, breaks invariance) + # 'non_canonical' -> Φ(s')=0 (allows additives, breaks invariance) # 'progressive_release' -> Φ(s')=Φ(s)*(1-decay_factor) # 'spike_cancel' -> Φ(s')=Φ(s)/γ (Δ ≈ 0, cancels shaping) # 'retain_previous' -> Φ(s')=Φ(s) @@ -1372,7 +1372,7 @@ class MyRLEnv(Base5ActionRLEnv): ) _allowed_exit_modes = { "canonical", - "non-canonical", + "non_canonical", "progressive_release", "spike_cancel", "retain_previous", @@ -1439,11 +1439,11 @@ class MyRLEnv(Base5ActionRLEnv): if self._entry_additive_enabled or self._exit_additive_enabled: logger.info( "Canonical mode: additive rewards disabled with Φ(terminal)=0. PBRS invariance is preserved. " - "To use additive rewards, set exit_potential_mode='non-canonical'." + "To use additive rewards, set exit_potential_mode='non_canonical'." ) self._entry_additive_enabled = False self._exit_additive_enabled = False - elif self._exit_potential_mode == "non-canonical": + elif self._exit_potential_mode == "non_canonical": if self._entry_additive_enabled or self._exit_additive_enabled: logger.info( "Non-canonical mode: additive rewards enabled with Φ(terminal)=0. PBRS invariance is intentionally broken." @@ -1701,7 +1701,7 @@ class MyRLEnv(Base5ActionRLEnv): See ``_apply_potential_shaping`` for complete PBRS documentation. """ mode = self._exit_potential_mode - if mode == "canonical" or mode == "non-canonical": + if mode == "canonical" or mode == "non_canonical": return 0.0 if mode == "progressive_release": decay = self._exit_potential_decay @@ -1959,7 +1959,7 @@ class MyRLEnv(Base5ActionRLEnv): elif is_exit: if ( self._exit_potential_mode == "canonical" - or self._exit_potential_mode == "non-canonical" + or self._exit_potential_mode == "non_canonical" ): next_potential = 0.0 exit_shaping_reward = -prev_potential -- 2.43.0