- Maximum trade duration in candles (from environment config)
- Should match your actual trading environment setting
-- Also used as fallback for `max_idle_duration_candles` when that tunable is ≤ 0 (idle penalty grace behaviour)
+- Drives the idle grace window: when `max_idle_duration_candles` ≤ 0, the fallback is `2 * max_trade_duration`.
### Reward Configuration
# Idle penalty (env defaults)
"idle_penalty_power": 1.0,
"idle_penalty_scale": 0.75,
- # If <=0 or unset, falls back to max_trade_duration_candles at runtime
+ # Fallback semantics: if <=0 or unset → 2 * max_trade_duration_candles (grace window before full idle penalty)
"max_idle_duration_candles": 0,
# Holding keys (env defaults)
"holding_penalty_scale": 0.5,
"base_factor": "Base reward factor used inside the environment.",
"idle_penalty_power": "Power applied to idle penalty scaling.",
"idle_penalty_scale": "Scale of idle penalty.",
- "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling; 0 = use max_trade_duration_candles.",
+ "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling; 0 = use 2 * max_trade_duration_candles.",
"holding_penalty_scale": "Scale of holding penalty.",
"holding_penalty_power": "Power applied to holding penalty scaling.",
"exit_factor_mode": "Time attenuation mode for exit factor.",
"""Mirror the environment's idle penalty behaviour."""
idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 0.75)
idle_penalty_power = _get_param_float(params, "idle_penalty_power", 1.0)
- max_trade_duration = int(params.get("max_trade_duration_candles", 128))
+ max_trade_duration_candles = params.get("max_trade_duration_candles", 128)
max_idle_duration_candles = params.get("max_idle_duration_candles")
try:
max_idle_duration = (
int(max_idle_duration_candles)
if max_idle_duration_candles is not None
- else max_trade_duration
+ else 2 * max_trade_duration_candles
)
except (TypeError, ValueError):
- max_idle_duration = max_trade_duration
+ max_idle_duration = max_trade_duration_candles
if max_idle_duration <= 0:
- max_idle_duration = max_trade_duration
+ max_idle_duration = 2 * max_trade_duration_candles
idle_duration_ratio = context.idle_duration / max(1, max_idle_duration)
return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
)
def test_max_idle_duration_candles_logic(self):
- """Idle penalty scaling test with explicit max_idle_duration_candles (no force-action comparisons)."""
+ """Idle penalty scaling test with explicit max_idle_duration_candles."""
params_small = self.DEFAULT_PARAMS.copy()
params_large = self.DEFAULT_PARAMS.copy()
# Activate explicit max idle durations
)
def test_idle_penalty_fallback_and_proportionality(self):
- """When max_idle_duration_candles <= 0, fallback to max_trade_duration and ensure proportional scaling.
+ """Fallback & proportionality validation.
- Also validates that penalty doubles (approximately) when idle_duration doubles (holding other params constant).
+ Semantics:
+ - When max_idle_duration_candles <= 0, fallback must be 2 * max_trade_duration (updated rule).
+ - Idle penalty scales ~ linearly with idle_duration (power=1), so doubling idle_duration doubles penalty magnitude.
+ - We also infer the implicit denominator from a mid-range idle duration (>1x and <2x trade duration) to ensure the
+ 2x fallback.
"""
params = self.DEFAULT_PARAMS.copy()
params["max_idle_duration_candles"] = 0 # force fallback
tolerance=0.2,
msg=f"Idle penalty proportionality mismatch (ratio={ratio})",
)
+ # Additional mid-range inference check (idle_duration between 1x and 2x trade duration)
+ ctx_mid = dataclasses.replace(ctx_a, idle_duration=120, max_trade_duration=100)
+ br_mid = calculate_reward(
+ ctx_mid,
+ params,
+ base_factor=base_factor,
+ profit_target=profit_target,
+ risk_reward_ratio=risk_reward_ratio,
+ short_allowed=True,
+ action_masking=True,
+ )
+ self.assertLess(br_mid.idle_penalty, 0.0)
+ idle_penalty_scale = float(params.get("idle_penalty_scale", 0.75))
+ idle_penalty_power = float(params.get("idle_penalty_power", 1.0))
+ # Internal factor may come from params (overrides provided base_factor argument)
+ factor_used = float(params.get("base_factor", base_factor))
+ idle_factor = factor_used * (profit_target * risk_reward_ratio) / 3.0
+ observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_scale)
+ if observed_ratio > 0:
+ implied_D = 120 / (observed_ratio ** (1 / idle_penalty_power))
+ self.assertAlmostEqualFloat(
+ implied_D,
+ 200.0,
+ tolerance=12.0, # modest tolerance for float ops / rounding
+ msg=f"Fallback denominator mismatch (implied={implied_D}, expected≈200, factor_used={factor_used})",
+ )
def test_exit_factor_threshold_warning_non_capping(self):
"""Ensure exit_factor_threshold does not cap the exit factor (warning-only semantics).
duration_ratio = 0.0
model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
- exit_factor_mode = str(
- model_reward_parameters.get("exit_factor_mode", "piecewise")
- ).lower()
+ exit_factor_mode = model_reward_parameters.get("exit_factor_mode", "piecewise")
def _legacy(f: float, dr: float, p: Mapping) -> float:
return f * (1.5 if dr <= 1.0 else 0.5)
if action == Actions.Neutral.value and self._position == Positions.Neutral:
max_idle_duration = int(
model_reward_parameters.get(
- "max_idle_duration_candles", max_trade_duration
+ "max_idle_duration_candles", 2 * max_trade_duration
)
)
if max_idle_duration <= 0:
- max_idle_duration = max_trade_duration
+ max_idle_duration = 2 * max_trade_duration
idle_penalty_scale = float(
model_reward_parameters.get("idle_penalty_scale", 0.75)
)