**/user_data/data/**
!.gitkeep
-*/.serena
+*/.serena/
*/.serena/**
*/.clinerules
*/.clinerules/**
| Parameter                    | Default | Description                |
| ---------------------------- | ------- | -------------------------- |
| `max_trade_duration_candles` | 128 | Trade duration cap |
| `max_idle_duration_candles` | None | Fallback 4× trade duration |
-| `idle_penalty_scale` | 0.5 | Idle penalty scale |
+| `idle_penalty_scale` | 1.0 | Idle penalty scale |
| `idle_penalty_power` | 1.025 | Idle penalty exponent |
-| `hold_penalty_scale` | 0.25 | Hold penalty scale |
+| `hold_penalty_scale` | 1.0 | Hold penalty scale |
| `hold_penalty_power` | 1.025 | Hold penalty exponent |
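For intuition, a minimal sketch of how these defaults combine. The penalty shape is inferred from the test inversion further below (`|idle_penalty| = idle_factor × idle_penalty_scale × (idle_duration / max_idle_duration)^power`, with `idle_factor = base_factor × profit_aim / risk_reward_ratio`); the `profit_aim` value here is illustrative, not a documented default:

```python
base_factor = 100.0
profit_aim = 0.03               # illustrative value, not a documented default
risk_reward_ratio = 1.0
idle_penalty_scale = 1.0
idle_penalty_power = 1.025
max_trade_duration_candles = 128
max_idle_duration_candles = 4 * max_trade_duration_candles  # documented fallback

idle_factor = base_factor * (profit_aim / risk_reward_ratio)  # 3.0

def idle_penalty(idle_duration: int) -> float:
    ratio = idle_duration / max_idle_duration_candles
    return -idle_factor * idle_penalty_scale * ratio**idle_penalty_power

print(round(idle_penalty(256), 3))  # -1.474 at half the idle cap
```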
#### Validation
| Parameter | Default | Description |
| ----------------------------------- | ------- | -------------------- |
-| `hold_potential_scale` | 1.0 | Hold potential scale |
+| `hold_potential_ratio` | 0.25 | Hold potential ratio |
| `hold_potential_gain` | 1.0 | Gain multiplier |
| `hold_potential_transform_pnl` | tanh | PnL transform |
| `hold_potential_transform_duration` | tanh | Duration transform |
| Parameter | Default | Description |
| ----------------------------------- | ------- | --------------------- |
| `entry_additive_enabled` | false | Enable entry additive |
-| `entry_additive_scale` | 1.0 | Scale |
+| `entry_additive_ratio`              | 0.125   | Entry additive ratio  |
| `entry_additive_gain` | 1.0 | Gain |
| `entry_additive_transform_pnl` | tanh | PnL transform |
| `entry_additive_transform_duration` | tanh | Duration transform |
| Parameter | Default | Description |
| ---------------------------------- | ------- | -------------------- |
| `exit_additive_enabled` | false | Enable exit additive |
-| `exit_additive_scale` | 1.0 | Scale |
+| `exit_additive_ratio`              | 0.125   | Exit additive ratio  |
| `exit_additive_gain` | 1.0 | Gain |
| `exit_additive_transform_pnl` | tanh | PnL transform |
| `exit_additive_transform_duration` | tanh | Duration transform |
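The former `*_scale` parameters are replaced by `*_ratio` parameters; the effective magnitude is derived as `scale = ratio × base_factor`. A minimal sketch under one assumption: the bi-component value multiplies the two transformed terms (`tanh × tanh`), which the tables imply but do not state:

```python
import math

base_factor = 100.0  # documented default

# Effective magnitudes from the documented default ratios
hold_potential_scale = 0.25 * base_factor    # 25.0
entry_additive_scale = 0.125 * base_factor   # 12.5
exit_additive_scale = 0.125 * base_factor    # 12.5

def bi_component(pnl: float, pnl_target: float, duration_ratio: float,
                 scale: float, gain: float = 1.0) -> float:
    # Assumed shape: scale * T_pnl(gain * r_pnl) * T_dur(gain * r_dur), T = tanh
    r_pnl = pnl / pnl_target
    r_dur = min(max(duration_ratio, 0.0), 1.0)
    return scale * math.tanh(gain * r_pnl) * math.tanh(gain * r_dur)

phi = bi_component(pnl=0.01, pnl_target=0.03, duration_ratio=0.2,
                   scale=hold_potential_scale)
print(round(phi, 2))  # 1.59
```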
"invalid_action": -2.0,
"base_factor": 100.0,
# Idle penalty defaults
- "idle_penalty_scale": 0.5,
+ "idle_penalty_scale": 1.0,
"idle_penalty_power": 1.025,
"max_trade_duration_candles": 128,
# Fallback: DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
"max_idle_duration_candles": None,
# Hold penalty defaults
- "hold_penalty_scale": 0.25,
+ "hold_penalty_scale": 1.0,
"hold_penalty_power": 1.025,
# Exit attenuation defaults
"exit_attenuation_mode": "linear",
"exit_potential_decay": 0.5,
# Hold potential (PBRS function Φ)
"hold_potential_enabled": True,
- "hold_potential_scale": 1.0,
+ "hold_potential_ratio": 0.25,
"hold_potential_gain": 1.0,
"hold_potential_transform_pnl": "tanh",
"hold_potential_transform_duration": "tanh",
# Entry additive (non-PBRS additive term)
"entry_additive_enabled": False,
- "entry_additive_scale": 1.0,
+ "entry_additive_ratio": 0.125,
"entry_additive_gain": 1.0,
"entry_additive_transform_pnl": "tanh",
"entry_additive_transform_duration": "tanh",
"exit_fee_rate": 0.0,
# Exit additive (non-PBRS additive term)
"exit_additive_enabled": False,
- "exit_additive_scale": 1.0,
+ "exit_additive_ratio": 0.125,
"exit_additive_gain": 1.0,
"exit_additive_transform_pnl": "tanh",
"exit_additive_transform_duration": "tanh",
"exit_potential_mode": "Exit potential mode (canonical|non_canonical|progressive_release|spike_cancel|retain_previous)",
"exit_potential_decay": "Decay for progressive_release (0–1)",
"hold_potential_enabled": "Enable hold potential Φ",
- "hold_potential_scale": "Hold potential scale",
+ "hold_potential_ratio": "Hold potential ratio",
"hold_potential_gain": "Hold potential gain",
"hold_potential_transform_pnl": "Hold PnL transform",
"hold_potential_transform_duration": "Hold duration transform",
"entry_additive_enabled": "Enable entry additive",
- "entry_additive_scale": "Entry additive scale",
+ "entry_additive_ratio": "Entry additive ratio",
"entry_additive_gain": "Entry additive gain",
"entry_additive_transform_pnl": "Entry PnL transform",
"entry_additive_transform_duration": "Entry duration transform",
"entry_fee_rate": "Entry fee rate",
"exit_fee_rate": "Exit fee rate",
"exit_additive_enabled": "Enable exit additive",
- "exit_additive_scale": "Exit additive scale",
+ "exit_additive_ratio": "Exit additive ratio",
"exit_additive_gain": "Exit additive gain",
"exit_additive_transform_pnl": "Exit PnL transform",
"exit_additive_transform_duration": "Exit duration transform",
# PBRS parameter bounds
"potential_gamma": {"min": 0.0, "max": 1.0},
"exit_potential_decay": {"min": 0.0, "max": 1.0},
- "hold_potential_scale": {"min": 0.0},
+ "hold_potential_ratio": {"min": 0.0},
"hold_potential_gain": {"min": 0.0},
- "entry_additive_scale": {"min": 0.0},
+ "entry_additive_ratio": {"min": 0.0},
"entry_additive_gain": {"min": 0.0},
"entry_fee_rate": {"min": 0.0, "max": 0.1},
"exit_fee_rate": {"min": 0.0, "max": 0.1},
- "exit_additive_scale": {"min": 0.0},
+ "exit_additive_ratio": {"min": 0.0},
"exit_additive_gain": {"min": 0.0},
}
return entry_additive_effective, exit_additive_effective, additives_suppressed
-def _is_strict_validation(params: RewardParams) -> bool:
- """Return strict validation flag from params (default True)."""
- return _get_bool_param(params, "strict_validation", True)
-
-
def _get_float_param(params: RewardParams, key: str, default: RewardParamValue) -> float:
"""Extract float parameter with type safety and default fallback."""
value = params.get(key, default)
def _fail_safely(reason: str) -> float:
- """Return 0.0 on recoverable numeric failure."""
+ """Return 0.0 on numeric failure."""
_ = reason
return 0.0
return 1.0 / (1.0 + exit_linear_slope * dr)
def _power_kernel(dr: float) -> float:
- tau = _get_float_param(
- params,
- "exit_power_tau",
- DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_power_tau", 0.5),
- )
- if 0.0 < tau <= 1.0:
- alpha = -math.log(tau) / _LOG_2
- else:
- if _is_strict_validation(params):
- raise ValueError(f"exit_power_tau={tau} must be in (0,1] in strict mode")
- warnings.warn(
- f"exit_power_tau={tau} invalid; falling back to alpha=1.0",
- RewardDiagnosticsWarning,
- stacklevel=2,
- )
+ raw_tau = params.get("exit_power_tau", None)
+ if raw_tau is None:
alpha = 1.0
+ else:
+ tau = _get_float_param(params, "exit_power_tau", np.nan)
+ if 0.0 < tau <= 1.0:
+ alpha = -math.log(tau) / _LOG_2
+ else:
+ warnings.warn(
+ f"exit_power_tau={tau} invalid; falling back to alpha=1.0",
+ RewardDiagnosticsWarning,
+ stacklevel=2,
+ )
+ alpha = 1.0
return 1.0 / math.pow(1.0 + dr, alpha)
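# Note: exit_power_tau=0.5 yields alpha = -ln(0.5)/ln(2) = 1.0, i.e. kernel = 1/(1+dr);
# smaller tau means larger alpha and faster attenuation.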
def _half_life_kernel(dr: float) -> float:
"exit_half_life",
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_half_life", 0.5),
)
- if hl <= 0.0 and _is_strict_validation(params):
- raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
if np.isclose(hl, 0.0):
warnings.warn(
f"exit_half_life={hl} close to 0; falling back to 1.0",
stacklevel=2,
)
return 1.0
+ if hl < 0.0:
+ warnings.warn(
+ f"exit_half_life={hl} negative; falling back to 1.0",
+ RewardDiagnosticsWarning,
+ stacklevel=2,
+ )
+ return 1.0
return math.pow(2.0, -dr / hl)
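# Note: the kernel halves every hl units of dr, since 2**(-dr/hl) = 0.5 at dr == hl.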
kernels = {
idle_penalty_scale = _get_float_param(
params,
"idle_penalty_scale",
- DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 0.5),
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 1.0),
)
idle_penalty_power = _get_float_param(
params,
hold_penalty_scale = _get_float_param(
params,
"hold_penalty_scale",
- DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_scale", 0.25),
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_scale", 1.0),
)
hold_penalty_power = _get_float_param(
params,
if "risk_reward_ratio" in params:
risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(risk_reward_ratio))
+ elif "rr" in params:
+ risk_reward_ratio = _get_float_param(params, "rr", float(risk_reward_ratio))
pnl_target = float(profit_aim * risk_reward_ratio)
- idle_factor = base_factor * (profit_aim / risk_reward_ratio) / 4.0
+ idle_factor = base_factor * (profit_aim / risk_reward_ratio)
hold_factor = idle_factor
max_trade_duration_candles = _get_int_param(
prev_potential=prev_potential,
params=params,
risk_reward_ratio=risk_reward_ratio,
+ base_factor=base_factor,
)
)
duration_ratio: float,
risk_reward_ratio: float,
params: RewardParams,
+ base_factor: float,
) -> float:
"""Compute PBRS hold potential Φ(s)."""
if not _get_bool_param(
pnl_target=pnl_target,
duration_ratio=duration_ratio,
params=params,
- scale_key="hold_potential_scale",
+ scale_key="hold_potential_ratio",
gain_key="hold_potential_gain",
transform_pnl_key="hold_potential_transform_pnl",
transform_dur_key="hold_potential_transform_duration",
non_finite_key="non_finite_hold_potential",
risk_reward_ratio=risk_reward_ratio,
+ base_factor=base_factor,
)
pnl_target: float,
duration_ratio: float,
params: RewardParams,
+ base_factor: float,
) -> float:
if not _get_bool_param(
params,
pnl_target=pnl_target,
duration_ratio=duration_ratio,
params=params,
- scale_key="entry_additive_scale",
+ scale_key="entry_additive_ratio",
gain_key="entry_additive_gain",
transform_pnl_key="entry_additive_transform_pnl",
transform_dur_key="entry_additive_transform_duration",
non_finite_key="non_finite_entry_additive",
+ base_factor=base_factor,
)
pnl_target: float,
duration_ratio: float,
params: RewardParams,
+ base_factor: float,
) -> float:
if not _get_bool_param(
params,
pnl_target=pnl_target,
duration_ratio=duration_ratio,
params=params,
- scale_key="exit_additive_scale",
+ scale_key="exit_additive_ratio",
gain_key="exit_additive_gain",
transform_pnl_key="exit_additive_transform_pnl",
transform_dur_key="exit_additive_transform_duration",
non_finite_key="non_finite_exit_additive",
+ base_factor=base_factor,
)
next_duration_ratio: float,
params: RewardParams,
*,
+ base_factor: float,
risk_reward_ratio: float,
+ prev_potential: float,
is_exit: bool = False,
is_entry: bool = False,
- prev_potential: float,
) -> tuple[float, float, float, float, float]:
"""Compute potential-based reward shaping (PBRS) components.
next_duration_ratio,
risk_reward_ratio,
params,
+ base_factor,
)
pbrs_delta = gamma * next_potential - prev_potential
reward_shaping = pbrs_delta
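# Canonical PBRS (Ng et al., 1999): the gamma*Phi(s') - Phi(s) terms telescope over
# an episode, so shaping preserves the optimal policy.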
entry_additive = 0.0
exit_additive = 0.0
else:
- cand_entry_add = _compute_entry_additive(next_pnl, pnl_target, next_duration_ratio, params)
+ cand_entry_add = _compute_entry_additive(
+ next_pnl, pnl_target, next_duration_ratio, params, base_factor
+ )
cand_exit_add = _compute_exit_additive(
- current_pnl, pnl_target, current_duration_ratio, params
+ current_pnl, pnl_target, current_duration_ratio, params, base_factor
)
entry_additive = cand_entry_add if is_entry else 0.0
exit_additive = cand_exit_add if is_exit else 0.0
next_duration_ratio: float,
params: RewardParams,
*,
+ base_factor: float,
risk_reward_ratio: float,
+ prev_potential: float,
is_exit: bool = False,
is_entry: bool = False,
- prev_potential: float,
) -> tuple[float, float, float, float, float, float]:
"""Compute shaped reward and PBRS diagnostics.
next_pnl,
next_duration_ratio,
params,
+ base_factor=base_factor,
risk_reward_ratio=risk_reward_ratio,
+ prev_potential=prev_potential,
is_exit=is_exit,
is_entry=is_entry,
- prev_potential=prev_potential,
)
)
transform_dur_key: str,
non_finite_key: str,
*,
+ base_factor: float,
risk_reward_ratio: Optional[float] = None,
) -> float:
"""Generic helper for (pnl, duration) bi-component transforms."""
pnl_ratio = float(pnl / pnl_target)
duration_ratio = float(np.clip(duration_ratio, 0.0, 1.0))
- scale = _get_float_param(params, scale_key, 1.0)
+ ratio = _get_float_param(params, scale_key, 0.25 if "hold" in scale_key else 0.125)
+ scale = ratio * base_factor
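+ # With documented defaults (base_factor=100.0): hold scale = 0.25 * 100 = 25.0; entry/exit = 12.5.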
gain = _get_float_param(params, gain_key, 1.0)
transform_pnl = _get_str_param(params, transform_pnl_key, "tanh")
transform_duration = _get_str_param(params, transform_dur_key, "tanh")
base_factor = _get_float_param(params, "base_factor", float(args.base_factor))
profit_aim = _get_float_param(params, "profit_aim", float(args.profit_aim))
- risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(args.risk_reward_ratio))
+ risk_reward_ratio = _get_float_param(
+ params,
+ "risk_reward_ratio",
+ _get_float_param(params, "rr", float(args.risk_reward_ratio)),
+ )
cli_action_masking = _to_bool(args.action_masking)
if "action_masking" in params:
**Setup:**
- Base configuration: hold_potential enabled, additives disabled
- Test configuration: entry_additive and exit_additive enabled
- - Additive parameters: scale=0.4, gain=1.0 for both entry/exit
+ - Additive parameters: ratio=0.4, gain=1.0 for both entry/exit
- Context: base_reward=0.05, pnl=0.01, duration_ratio=0.2
**Assertions:**
{
"entry_additive_enabled": True,
"exit_additive_enabled": True,
- "entry_additive_scale": 0.4,
- "exit_additive_scale": 0.4,
+ "entry_additive_ratio": 0.4,
+ "exit_additive_ratio": 0.4,
"entry_additive_gain": 1.0,
"exit_additive_gain": 1.0,
}
"is_exit": False,
}
s0, _n0, _pbrs0, _entry0, _exit0 = compute_pbrs_components(
- prev_potential=0.0, params=base, **ctx
+ params=base,
+ base_factor=PARAMS.BASE_FACTOR,
+ prev_potential=0.0,
+ **ctx,
)
t0 = base_reward + s0 + _entry0 + _exit0
s1, _n1, _pbrs1, _entry1, _exit1 = compute_pbrs_components(
- prev_potential=0.0, params=with_add, **ctx
+ params=with_add,
+ base_factor=PARAMS.BASE_FACTOR,
+ prev_potential=0.0,
+ **ctx,
)
t1 = base_reward + s1 + _entry1 + _exit1
self.assertFinite(t1)
import pytest
from reward_space_analysis import (
+ DEFAULT_IDLE_DURATION_MULTIPLIER,
Actions,
Positions,
_compute_efficiency_coefficient,
_get_exit_factor,
_get_float_param,
calculate_reward,
+ get_max_idle_duration_candles,
)
from ..constants import PARAMS, SCENARIOS, TOLERANCE
"""Test hold potential computation returns finite values."""
params = {
"hold_potential_enabled": True,
- "hold_potential_scale": 1.0,
+ "hold_potential_ratio": 1.0,
"hold_potential_gain": 1.0,
"hold_potential_transform_pnl": "tanh",
"hold_potential_transform_duration": "tanh",
0.3,
PARAMS.RISK_REWARD_RATIO,
params,
+ PARAMS.BASE_FACTOR,
)
self.assertFinite(val, name="hold_potential")
- penalty(duration=40) ≈ 2 × penalty(duration=20)
- Proportional scaling with idle duration
"""
- params = self.base_params(max_idle_duration_candles=None, max_trade_duration_candles=100)
base_factor = PARAMS.BASE_FACTOR
profit_aim = PARAMS.PROFIT_AIM
risk_reward_ratio = 1.0
+ max_trade_duration_candles = 100
+ params = self.base_params(
+ max_idle_duration_candles=None,
+ max_trade_duration_candles=max_trade_duration_candles,
+ base_factor=base_factor,
+ )
+ expected_max_idle_duration_candles = int(
+ DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
+ )
+ self.assertEqual(
+ get_max_idle_duration_candles(params),
+ expected_max_idle_duration_candles,
+ "Expected fallback max_idle_duration from max_trade_duration",
+ )
base_context_kwargs = {
"pnl": 0.0,
if ratio is not None:
self.assertAlmostEqualFloat(abs(ratio), 2.0, tolerance=0.2)
- idle_penalty_scale = _get_float_param(params, "idle_penalty_scale", 0.5)
+ idle_penalty_scale = _get_float_param(params, "idle_penalty_scale", 1.0)
idle_penalty_power = _get_float_param(params, "idle_penalty_power", 1.025)
- base_factor = _get_float_param(params, "base_factor", float(base_factor))
- risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(risk_reward_ratio))
- idle_factor = base_factor * (profit_aim / risk_reward_ratio) / 4.0
+ idle_factor = base_factor * (profit_aim / risk_reward_ratio)
observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_scale)
if observed_ratio > 0:
implied_max_idle_duration_candles = 120 / observed_ratio ** (1 / idle_penalty_power)
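# Inverts |idle_penalty| = idle_factor * scale * (idle/max_idle)^power at idle_duration=120.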
- self.assertAlmostEqualFloat(implied_max_idle_duration_candles, 400.0, tolerance=20.0)
+ tolerance = 0.05 * expected_max_idle_duration_candles
+ self.assertAlmostEqualFloat(
+ implied_max_idle_duration_candles,
+ float(expected_max_idle_duration_candles),
+ tolerance=tolerance,
+ )
# Owns invariant: components-pbrs-breakdown-fields-119
def test_pbrs_breakdown_fields_finite_and_aligned(self):
msg="invariance_correction should be ~0 in canonical mode",
)
+ def test_rr_alias_matches_risk_reward_ratio(self):
+ """`rr` param alias matches `risk_reward_ratio` runtime naming."""
+ context = self.make_ctx(
+ pnl=0.02,
+ trade_duration=40,
+ idle_duration=0,
+ max_unrealized_profit=0.03,
+ min_unrealized_profit=0.01,
+ position=Positions.Long,
+ action=Actions.Long_exit,
+ )
+ rr_value = 1.75
+
+ # Canonical spelling
+ params_ratio = self.base_params(
+ exit_potential_mode="canonical",
+ risk_reward_ratio=rr_value,
+ )
+ params_ratio.pop("rr", None)
+
+ # Runtime spelling
+ params_rr = self.base_params(
+ exit_potential_mode="canonical",
+ rr=rr_value,
+ )
+ params_rr.pop("risk_reward_ratio", None)
+
+ br_ratio = calculate_reward(
+ context,
+ params_ratio,
+ base_factor=PARAMS.BASE_FACTOR,
+ profit_aim=PARAMS.PROFIT_AIM,
+ risk_reward_ratio=1.0,
+ short_allowed=True,
+ action_masking=True,
+ )
+ br_rr = calculate_reward(
+ context,
+ params_rr,
+ base_factor=PARAMS.BASE_FACTOR,
+ profit_aim=PARAMS.PROFIT_AIM,
+ risk_reward_ratio=1.0,
+ short_allowed=True,
+ action_masking=True,
+ )
+
+ self.assertAlmostEqualFloat(
+ br_rr.total,
+ br_ratio.total,
+ tolerance=TOLERANCE.IDENTITY_STRICT,
+ msg="Total reward should match when using rr alias",
+ )
+ self.assertAlmostEqualFloat(
+ br_rr.exit_component,
+ br_ratio.exit_component,
+ tolerance=TOLERANCE.IDENTITY_STRICT,
+ msg="Exit component should match when using rr alias",
+ )
+
if __name__ == "__main__":
unittest.main()
Attributes:
TERMINAL_TOL: Terminal potential must be within this tolerance of zero (1e-09)
- MAX_ABS_SHAPING: Maximum absolute shaping value for bounded checks (10.0)
+ MAX_ABS_SHAPING: Maximum absolute shaping value for bounded checks (50.0)
TERMINAL_PROBABILITY: Default probability of terminal state in sweeps (0.08)
"""
TERMINAL_TOL: float = 1e-09
- MAX_ABS_SHAPING: float = 10.0
+ MAX_ABS_SHAPING: float = 50.0
TERMINAL_PROBABILITY: float = 0.08
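# Bound rationale (assuming tanh-bounded transforms and base_factor=100):
# |Phi| <= 0.25 * 100 = 25, so |gamma*Phi' - Phi| <= (1 + gamma) * 25 <= 50.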
TRADE_DURATION_LONG: Long trade duration in steps (200)
# Common additive parameters
- ADDITIVE_SCALE_DEFAULT: Default additive scale factor (0.4)
+ ADDITIVE_RATIO_DEFAULT: Default additive ratio (0.4)
ADDITIVE_GAIN_DEFAULT: Default additive gain (1.0)
"""
TRADE_DURATION_LONG: int = 200
# Additive parameters
- ADDITIVE_SCALE_DEFAULT: float = 0.4
+ ADDITIVE_RATIO_DEFAULT: float = 0.4
ADDITIVE_GAIN_DEFAULT: float = 1.0
current_dur,
PARAMS.RISK_REWARD_RATIO,
params,
+ PARAMS.BASE_FACTOR,
)
(
_total_reward,
current_duration_ratio=current_dur,
next_pnl=0.0,
next_duration_ratio=0.0,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=True,
is_entry=False,
current_dur,
PARAMS.RISK_REWARD_RATIO,
params,
+ PARAMS.BASE_FACTOR,
)
+
gamma = _get_float_param(
params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95)
)
current_duration_ratio=current_dur,
next_pnl=0.0,
next_duration_ratio=0.0,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=True,
is_entry=False,
def test_additive_components_disabled_return_zero(self):
"""Verifies entry/exit additives return zero when disabled."""
- params_entry = {"entry_additive_enabled": False, "entry_additive_scale": 1.0}
+ params_entry = {"entry_additive_enabled": False, "entry_additive_ratio": 1.0}
val_entry = _compute_entry_additive(
- 0.5, PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO, 0.3, params_entry
+ 0.5,
+ PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
+ 0.3,
+ params_entry,
+ PARAMS.BASE_FACTOR,
)
self.assertEqual(float(val_entry), 0.0)
- params_exit = {"exit_additive_enabled": False, "exit_additive_scale": 1.0}
+ params_exit = {"exit_additive_enabled": False, "exit_additive_ratio": 1.0}
val_exit = _compute_exit_additive(
- 0.5, PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO, 0.3, params_exit
+ 0.5,
+ PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
+ 0.3,
+ params_exit,
+ PARAMS.BASE_FACTOR,
)
self.assertEqual(float(val_exit), 0.0)
current_duration_ratio=0.0,
next_pnl=0.01,
next_duration_ratio=0.0,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=False,
is_entry=True,
current_duration_ratio=0.4,
next_pnl=0.02,
next_duration_ratio=0.41,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=False,
is_entry=False,
next_pnl=next_pnl,
next_duration_ratio=next_duration_ratio,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+ base_factor=PARAMS.BASE_FACTOR,
is_exit=True,
is_entry=False,
prev_potential=0.789,
hold_potential_enabled=True,
entry_additive_enabled=True,
exit_additive_enabled=True,
- entry_additive_scale=10.0,
- exit_additive_scale=10.0,
+ entry_additive_ratio=10.0,
+ exit_additive_ratio=10.0,
)
(
current_duration_ratio=0.0,
next_pnl=0.02,
next_duration_ratio=0.0,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=False,
is_entry=True,
current_dur,
PARAMS.RISK_REWARD_RATIO,
params,
+ PARAMS.BASE_FACTOR,
)
(
current_duration_ratio=current_dur,
next_pnl=0.0,
next_duration_ratio=0.0,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=True,
is_entry=False,
next_pnl=0.0,
next_duration_ratio=0.0,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+ base_factor=PARAMS.BASE_FACTOR,
is_exit=True,
prev_potential=prev_potential,
params=params,
current_duration_ratio=0.2,
next_pnl=0.035,
next_duration_ratio=0.25,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=False,
prev_potential=0.0,
current_duration_ratio=0.2,
next_pnl=0.035,
next_duration_ratio=0.25,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=False,
prev_potential=0.0,
entry_additive_enabled=False,
exit_additive_enabled=False,
potential_gamma=0.9,
+ base_factor=PARAMS.BASE_FACTOR,
)
trade_duration = 5
duration_ratio=(trade_duration / max_trade_duration_candles),
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
params=params,
+ base_factor=PARAMS.BASE_FACTOR,
)
self.assertAlmostEqualFloat(
breakdown.next_potential,
"""Batch validate strict failures + relaxed multi-reason aggregation via helpers."""
strict_failures = [
build_validation_case({"potential_gamma": -0.2}, strict=True, expect_error=True),
- build_validation_case({"hold_potential_scale": -5.0}, strict=True, expect_error=True),
+ build_validation_case({"hold_potential_ratio": -5.0}, strict=True, expect_error=True),
]
success_case = build_validation_case({}, strict=True, expect_error=False)
relaxed_case = build_validation_case(
{
"potential_gamma": "not-a-number",
- "hold_potential_scale": "-5.0",
+ "hold_potential_ratio": "-5.0",
"max_idle_duration_candles": "nan",
},
strict=False,
params_relaxed.update(
{
"potential_gamma": "not-a-number",
- "hold_potential_scale": "-5.0",
+ "hold_potential_ratio": "-5.0",
"max_idle_duration_candles": "nan",
}
)
params_relaxed,
{
"potential_gamma": ["non_numeric_reset"],
- "hold_potential_scale": ["numeric_coerce", "min="],
+ "hold_potential_ratio": ["numeric_coerce", "min="],
"max_idle_duration_candles": ["derived_default"],
},
)
potential_gamma=gamma,
entry_additive_enabled=False,
exit_additive_enabled=False,
- hold_potential_scale=1.0,
+ hold_potential_ratio=1.0,
)
ctx_pnl = 0.012
ctx_dur_ratio = 0.3
ctx_dur_ratio,
PARAMS.RISK_REWARD_RATIO,
params_can,
+ PARAMS.BASE_FACTOR,
)
self.assertFinite(prev_phi, name="prev_phi")
next_phi_can = _compute_exit_potential(prev_phi, params_can)
potential_gamma=0.94,
)
prev_potential = 0.42
+ current_pnl = 0.02
+ current_dur = 0.5
+ profit_aim = PARAMS.PROFIT_AIM
(
_total_reward,
reward_shaping,
_exit_additive,
) = apply_potential_shaping(
base_reward=0.0,
- current_pnl=0.012,
- pnl_target=PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
- current_duration_ratio=0.3,
+ current_pnl=current_pnl,
+ pnl_target=profit_aim * PARAMS.RISK_REWARD_RATIO,
+ current_duration_ratio=current_dur,
next_pnl=0.0,
next_duration_ratio=0.0,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
is_exit=True,
is_entry=False,
entry_additive_enabled=False,
exit_additive_enabled=False,
potential_gamma=0.9,
+ base_factor=PARAMS.BASE_FACTOR,
)
pnl_target = PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO
ctx = self.make_ctx(
current_duration_ratio,
PARAMS.RISK_REWARD_RATIO,
params,
+ PARAMS.BASE_FACTOR,
)
+
self.assertNotEqual(prev_potential, 0.0)
breakdown = calculate_reward(
next_pnl=0.025,
next_duration_ratio=0.35,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+ base_factor=PARAMS.BASE_FACTOR,
is_exit=False,
prev_potential=0.0,
params=params,
next_pnl=next_pnl,
next_duration_ratio=next_dur,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+ base_factor=PARAMS.BASE_FACTOR,
is_exit=is_exit,
prev_potential=prev_potential,
params=params,
next_pnl=next_pnl,
next_duration_ratio=next_dur,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+ base_factor=PARAMS.BASE_FACTOR,
is_exit=is_exit,
prev_potential=prev_potential,
params=params,
self.assertGreater(
abs(shaping_sum),
PBRS_INVARIANCE_TOL * 50,
- f"Expected non-zero Σ shaping (got {shaping_sum})",
+ f"Expected non-zero shaping (got {shaping_sum})",
)
# Non-owning smoke; ownership: robustness/test_robustness.py:35 (robustness-decomposition-integrity-101)
PBRS_INTEGRATION_PARAMS = [
"potential_gamma",
"hold_potential_enabled",
- "hold_potential_scale",
+ "hold_potential_ratio",
"entry_additive_enabled",
"exit_additive_enabled",
]
current_duration_ratio=current_dur,
next_pnl=next_pnl,
next_duration_ratio=next_dur,
+ base_factor=PARAMS.BASE_FACTOR,
risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+ prev_potential=prev_potential,
is_exit=is_exit,
is_entry=False,
- prev_potential=prev_potential,
params=params,
)
)
DEFAULT_EXIT_POTENTIAL_DECAY: Final[float] = 0.5
DEFAULT_ENTRY_ADDITIVE_ENABLED: Final[bool] = False
- DEFAULT_ENTRY_ADDITIVE_SCALE: Final[float] = 1.0
+ DEFAULT_ENTRY_ADDITIVE_RATIO: Final[float] = 0.125
DEFAULT_ENTRY_ADDITIVE_GAIN: Final[float] = 1.0
DEFAULT_HOLD_POTENTIAL_ENABLED: Final[bool] = True
- DEFAULT_HOLD_POTENTIAL_SCALE: Final[float] = 1.0
+ DEFAULT_HOLD_POTENTIAL_RATIO: Final[float] = 0.25
DEFAULT_HOLD_POTENTIAL_GAIN: Final[float] = 1.0
DEFAULT_EXIT_ADDITIVE_ENABLED: Final[bool] = False
- DEFAULT_EXIT_ADDITIVE_SCALE: Final[float] = 1.0
+ DEFAULT_EXIT_ADDITIVE_RATIO: Final[float] = 0.125
DEFAULT_EXIT_ADDITIVE_GAIN: Final[float] = 1.0
DEFAULT_EXIT_PLATEAU: Final[bool] = True
DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
DEFAULT_INVALID_ACTION: Final[float] = -2.0
- DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 0.5
+ DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 1.0
DEFAULT_IDLE_PENALTY_POWER: Final[float] = 1.025
- DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 0.25
+ DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 1.0
DEFAULT_HOLD_PENALTY_POWER: Final[float] = 1.025
DEFAULT_CHECK_INVARIANTS: Final[bool] = True
"entry_additive_enabled", ReforceXY.DEFAULT_ENTRY_ADDITIVE_ENABLED
)
)
- self._entry_additive_scale: float = float(
+ self._entry_additive_ratio: float = float(
model_reward_parameters.get(
- "entry_additive_scale", ReforceXY.DEFAULT_ENTRY_ADDITIVE_SCALE
+ "entry_additive_ratio", ReforceXY.DEFAULT_ENTRY_ADDITIVE_RATIO
)
)
self._entry_additive_gain: float = float(
"hold_potential_enabled", ReforceXY.DEFAULT_HOLD_POTENTIAL_ENABLED
)
)
- self._hold_potential_scale: float = float(
+ self._hold_potential_ratio: float = float(
model_reward_parameters.get(
- "hold_potential_scale", ReforceXY.DEFAULT_HOLD_POTENTIAL_SCALE
+ "hold_potential_ratio", ReforceXY.DEFAULT_HOLD_POTENTIAL_RATIO
)
)
self._hold_potential_gain: float = float(
"exit_additive_enabled", ReforceXY.DEFAULT_EXIT_ADDITIVE_ENABLED
)
)
- self._exit_additive_scale: float = float(
+ self._exit_additive_ratio: float = float(
model_reward_parameters.get(
- "exit_additive_scale", ReforceXY.DEFAULT_EXIT_ADDITIVE_SCALE
+ "exit_additive_ratio", ReforceXY.DEFAULT_EXIT_ADDITIVE_RATIO
)
)
self._exit_additive_gain: float = float(
duration_ratio: float,
pnl: float,
pnl_target: float,
+ scale: float,
) -> float:
"""Compute PBRS potential Φ(s) for position holding states.
pnl=pnl,
pnl_target=pnl_target,
duration_ratio=duration_ratio,
- scale=self._hold_potential_scale,
+ scale=scale,
gain=self._hold_potential_gain,
transform_pnl=self._hold_potential_transform_pnl,
transform_duration=self._hold_potential_transform_duration,
pnl: float,
pnl_target: float,
duration_ratio: float,
+ scale: float,
) -> float:
"""Compute exit additive reward for position exit transitions.
pnl=pnl,
pnl_target=pnl_target,
duration_ratio=duration_ratio,
- scale=self._exit_additive_scale,
+ scale=scale,
gain=self._exit_additive_gain,
transform_pnl=self._exit_additive_transform_pnl,
transform_duration=self._exit_additive_transform_duration,
pnl: float,
pnl_target: float,
duration_ratio: float,
+ scale: float,
) -> float:
"""Compute entry additive reward for position entry transitions.
pnl=pnl,
pnl_target=pnl_target,
duration_ratio=duration_ratio,
- scale=self._entry_additive_scale,
+ scale=scale,
gain=self._entry_additive_gain,
transform_pnl=self._entry_additive_transform_pnl,
transform_duration=self._entry_additive_transform_duration,
max_trade_duration: float,
pnl: float,
pnl_target: float,
+ hold_potential_scale: float,
+ entry_additive_scale: float,
+ exit_additive_scale: float,
) -> tuple[float, float, float]:
"""Compute potential-based reward shaping (PBRS) components.
**State Variables:**
r_pnl : pnl / pnl_target (PnL ratio)
r_dur : duration / max_duration (duration ratio, clamp [0,1])
+ scale : magnitude scale (= ratio * base_factor)
g : gain parameter
T_x : transform function (tanh, softsign, etc.)
Current position PnL (for current state s)
pnl_target : float
Target PnL for ratio normalization: r_pnl = pnl / pnl_target
+ hold_potential_scale : float
+ Magnitude scale for hold potential (= hold_potential_ratio * base_factor)
+ entry_additive_scale : float
+ Magnitude scale for entry additive (= entry_additive_ratio * base_factor)
+ exit_additive_scale : float
+ Magnitude scale for exit additive (= exit_additive_ratio * base_factor)
Returns
-------
if is_entry or is_hold:
if self._hold_potential_enabled:
next_potential = self._compute_hold_potential(
- next_position, next_duration_ratio, next_pnl, pnl_target
+ next_position,
+ next_duration_ratio,
+ next_pnl,
+ pnl_target,
+ hold_potential_scale,
)
reward_shaping = gamma * next_potential - prev_potential
else:
and not self.is_pbrs_invariant_mode()
):
entry_additive = self._compute_entry_additive(
- pnl=next_pnl,
- pnl_target=pnl_target,
- duration_ratio=next_duration_ratio,
+ next_pnl,
+ pnl_target,
+ next_duration_ratio,
+ entry_additive_scale,
)
self._total_entry_additive += float(entry_additive)
if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
duration_ratio = trade_duration / max(max_trade_duration, 1)
exit_additive = self._compute_exit_additive(
- pnl, pnl_target, duration_ratio
+ pnl, pnl_target, duration_ratio, exit_additive_scale
)
self._total_exit_additive += float(exit_additive)
model_reward_parameters: Mapping[str, Any],
) -> float:
"""
- Compute exit factor: base_factor × time_attenuation_coefficient x pnl_target_coefficient x efficiency_coefficient.
+ Compute exit factor: base_factor × time_attenuation_coefficient × pnl_target_coefficient × efficiency_coefficient.
"""
if not (
np.isfinite(base_factor)
base_factor = float(
model_reward_parameters.get("base_factor", ReforceXY.DEFAULT_BASE_FACTOR)
)
- idle_factor = base_factor * (self.profit_aim / self.rr) / 4.0
+ idle_factor = base_factor * (self.profit_aim / self.rr)
hold_factor = idle_factor
# 2. Idle penalty
self._last_hold_penalty = float(base_reward)
# 4. Exit rewards
- pnl = self.get_unrealized_profit()
+ pnl: float = self.get_unrealized_profit()
if (
base_reward is None
and action == Actions.Long_exit.value
base_reward = 0.0
# 6. Potential-based reward shaping
+ hold_potential_scale = self._hold_potential_ratio * base_factor
+ entry_additive_scale = self._entry_additive_ratio * base_factor
+ exit_additive_scale = self._exit_additive_ratio * base_factor
+
reward_shaping, entry_additive, exit_additive = self._compute_pbrs_components(
action=action,
trade_duration=trade_duration,
max_trade_duration=max_trade_duration,
pnl=pnl,
pnl_target=self._pnl_target,
+ hold_potential_scale=hold_potential_scale,
+ entry_additive_scale=entry_additive_scale,
+ exit_additive_scale=exit_additive_scale,
)
return base_reward + reward_shaping + entry_additive + exit_additive
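To close the loop, a worked example of the composition returned above (all values illustrative; `gamma` uses the documented `potential_gamma` default):

```python
# Illustrative values only; names mirror the code above.
gamma = 0.95                   # documented potential_gamma default
prev_potential, next_potential = 0.42, 0.40
reward_shaping = gamma * next_potential - prev_potential  # 0.38 - 0.42 = -0.04

base_reward = 0.05
entry_additive = exit_additive = 0.0  # additives disabled by default

total = base_reward + reward_shaping + entry_additive + exit_additive
print(round(total, 6))  # 0.01
```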