```shell
uv run python reward_space_analysis.py \
--num_samples 30000 \
- --params win_reward_factor=4.0 idle_penalty_scale=1.5 \
+ --params win_reward_factor=4.0 idle_penalty_ratio=1.5 \
--out_dir sensitivity_test
```
| Parameter | Default | Description |
| ---------------------------- | ------- | -------------------------- |
| `max_trade_duration_candles` | 128 | Trade duration cap |
| `max_idle_duration_candles` | None | Idle duration cap; `None` falls back to 4× trade duration |
-| `idle_penalty_scale` | 1.0 | Idle penalty scale |
+| `idle_penalty_ratio` | 1.0 | Idle penalty ratio |
| `idle_penalty_power` | 1.025 | Idle penalty exponent |
-| `hold_penalty_scale` | 1.0 | Hold penalty scale |
+| `hold_penalty_ratio` | 1.0 | Hold penalty ratio |
| `hold_penalty_power` | 1.025 | Hold penalty exponent |
#### Validation
Validation applies both to individual tunable flags and to bulk `--params` values; the two invocations below are equivalent.
```shell
-uv run python reward_space_analysis.py --win_reward_factor 3.0 --idle_penalty_scale 2.0 --num_samples 15000
-uv run python reward_space_analysis.py --params win_reward_factor=3.0 idle_penalty_scale=2.0 --num_samples 15000
+uv run python reward_space_analysis.py --win_reward_factor 3.0 --idle_penalty_ratio 2.0 --num_samples 15000
+uv run python reward_space_analysis.py --params win_reward_factor=3.0 idle_penalty_ratio=2.0 --num_samples 15000
```
If the same key is supplied both as a dedicated flag and inside `--params`, the `--params` value takes precedence.
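A minimal sketch of that precedence, assuming a hypothetical `merge_overrides` helper (the real merge happens inside the script's argument handling):

```python
def merge_overrides(defaults: dict, cli_flags: dict, params_kv: dict) -> dict:
    """Apply per-tunable flags first, then bulk --params last, so --params wins."""
    merged = dict(defaults)
    merged.update({k: v for k, v in cli_flags.items() if v is not None})
    merged.update(params_kv)  # applied last, so it takes precedence on conflicts
    return merged

# The flag sets idle_penalty_ratio=1.0, --params sets 2.0; --params wins.
print(merge_overrides({}, {"idle_penalty_ratio": 1.0}, {"idle_penalty_ratio": 2.0}))
# {'idle_penalty_ratio': 2.0}
```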
"invalid_action": -2.0,
"base_factor": 100.0,
# Idle penalty defaults
- "idle_penalty_scale": 1.0,
+ "idle_penalty_ratio": 1.0,
"idle_penalty_power": 1.025,
"max_trade_duration_candles": 128,
# Fallback: DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
"max_idle_duration_candles": None,
# Hold penalty defaults
- "hold_penalty_scale": 1.0,
+ "hold_penalty_ratio": 1.0,
"hold_penalty_power": 1.025,
# Exit attenuation defaults
"exit_attenuation_mode": "linear",
"invalid_action": "Penalty for invalid actions",
"base_factor": "Base reward scale",
"idle_penalty_power": "Idle penalty exponent",
- "idle_penalty_scale": "Idle penalty scale",
+ "idle_penalty_ratio": "Idle penalty ratio",
"max_trade_duration_candles": "Trade duration cap (candles)",
"max_idle_duration_candles": "Idle duration cap (candles)",
- "hold_penalty_scale": "Hold penalty scale",
+ "hold_penalty_ratio": "Hold penalty ratio",
"hold_penalty_power": "Hold penalty exponent",
"exit_attenuation_mode": "Exit kernel (legacy|sqrt|linear|power|half_life)",
"exit_plateau": "Use plateau before attenuation",
"invalid_action": {"max": 0.0}, # penalty should be <= 0
"base_factor": {"min": 0.0},
"idle_penalty_power": {"min": 0.0},
- "idle_penalty_scale": {"min": 0.0},
+ "idle_penalty_ratio": {"min": 0.0},
"max_trade_duration_candles": {"min": 1.0},
"max_idle_duration_candles": {"min": 0.0},
- "hold_penalty_scale": {"min": 0.0},
+ "hold_penalty_ratio": {"min": 0.0},
"hold_penalty_power": {"min": 0.0},
"exit_linear_slope": {"min": 0.0},
"exit_plateau_grace": {"min": 0.0},
"""Dynamically add CLI options for each tunable in DEFAULT_MODEL_REWARD_PARAMETERS.
Rules:
- - Use the same underscored names as option flags (e.g., --idle_penalty_scale).
+ - Use the same underscored names as option flags (e.g., --idle_penalty_ratio).
- Defaults are None so only user-provided values override params.
- For exit_attenuation_mode, enforce allowed choices (case-sensitive).
- Skip keys already managed as top-level options (e.g., base_factor) to avoid duplicates.
```
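A sketch of those rules in code; the `MANAGED_KEYS` set, the stubbed defaults dict, and the blanket `type=float` are illustrative assumptions, not the script's exact implementation:

```python
import argparse

# Stub of the defaults dict shown earlier; the real one has many more keys.
DEFAULT_MODEL_REWARD_PARAMETERS = {
    "base_factor": 100.0,
    "idle_penalty_ratio": 1.0,
    "exit_attenuation_mode": "linear",
}
MANAGED_KEYS = {"base_factor"}  # assumed: keys already exposed as top-level options
EXIT_ATTENUATION_MODES = ["legacy", "sqrt", "linear", "power", "half_life"]

def add_tunable_cli_args(parser: argparse.ArgumentParser) -> None:
    for key in DEFAULT_MODEL_REWARD_PARAMETERS:
        if key in MANAGED_KEYS:
            continue  # skip keys managed as top-level options to avoid duplicates
        if key == "exit_attenuation_mode":
            # enforce allowed choices, case-sensitive
            parser.add_argument(f"--{key}", default=None, choices=EXIT_ATTENUATION_MODES)
        else:
            # default None so only user-provided values override params
            parser.add_argument(f"--{key}", type=float, default=None)

parser = argparse.ArgumentParser()
add_tunable_cli_args(parser)
print(parser.parse_args(["--idle_penalty_ratio", "2.0"]).idle_penalty_ratio)  # 2.0
```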
```python
def _idle_penalty(context: RewardContext, idle_factor: float, params: RewardParams) -> float:
"""Compute idle penalty."""
- idle_penalty_scale = _get_float_param(
+ idle_penalty_ratio = _get_float_param(
params,
- "idle_penalty_scale",
- DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 1.0),
+ "idle_penalty_ratio",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_ratio", 1.0),
)
idle_penalty_power = _get_float_param(
params,
"idle_penalty_power",
DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_power", 1.025),
)
max_idle_duration_candles = get_max_idle_duration_candles(params)
idle_duration_ratio = context.idle_duration / max(1, max_idle_duration_candles)
- return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
+ return -idle_factor * idle_penalty_ratio * idle_duration_ratio**idle_penalty_power
```
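For intuition, a worked example using the defaults (`idle_penalty_ratio=1.0`, `idle_penalty_power=1.025`); the `idle_factor` of 1.0 is purely illustrative, as the real value is computed elsewhere:

```python
idle_factor = 1.0           # illustrative only
idle_penalty_ratio = 1.0    # default
idle_penalty_power = 1.025  # default
idle_duration, max_idle = 256, 512  # halfway to the idle cap

idle_duration_ratio = idle_duration / max(1, max_idle)  # 0.5
penalty = -idle_factor * idle_penalty_ratio * idle_duration_ratio**idle_penalty_power
print(round(penalty, 3))  # -0.491: slightly milder than linear below the cap, since power > 1
```

Doubling `idle_penalty_ratio` doubles the penalty everywhere; raising `idle_penalty_power` reshapes the curve so penalties stay mild early and ramp up toward the cap.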
```python
def _hold_penalty(context: RewardContext, hold_factor: float, params: RewardParams) -> float:
"""Compute hold penalty."""
- hold_penalty_scale = _get_float_param(
+ hold_penalty_ratio = _get_float_param(
params,
- "hold_penalty_scale",
- DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_scale", 1.0),
+ "hold_penalty_ratio",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_ratio", 1.0),
)
hold_penalty_power = _get_float_param(
params,
"hold_penalty_power",
DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_power", 1.025),
)
if duration_ratio < 1.0:
return _fail_safely("hold_penalty_duration_ratio_lt_1")
- return -hold_factor * hold_penalty_scale * (duration_ratio - 1.0) ** hold_penalty_power
+ return -hold_factor * hold_penalty_ratio * (duration_ratio - 1.0) ** hold_penalty_power
```
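The hold penalty only engages once `duration_ratio` exceeds 1.0, i.e. past the trade-duration cap; below that, `_fail_safely` is assumed to yield 0.0. A worked example with the defaults and an illustrative `hold_factor` of 1.0:

```python
hold_factor = 1.0           # illustrative only
hold_penalty_ratio = 1.0    # default
hold_penalty_power = 1.025  # default

for duration_ratio in (0.5, 1.0, 1.5, 2.0):
    if duration_ratio < 1.0:
        penalty = 0.0  # stands in for _fail_safely(...)
    else:
        penalty = -hold_factor * hold_penalty_ratio * (duration_ratio - 1.0) ** hold_penalty_power
    print(duration_ratio, round(penalty, 3))
# 0.5 0.0 | 1.0 -0.0 | 1.5 -0.491 | 2.0 -1.0
```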
nargs="*",
default=[],
metavar="KEY=VALUE",
- help="Override reward parameters, e.g. hold_penalty_scale=0.5",
+ help="Override reward parameters, e.g. hold_penalty_ratio=0.5",
)
# Dynamically add CLI options for all tunables
add_tunable_cli_args(parser)
```
```python
DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
DEFAULT_INVALID_ACTION: Final[float] = -2.0
- DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 1.0
+ DEFAULT_IDLE_PENALTY_RATIO: Final[float] = 1.0
DEFAULT_IDLE_PENALTY_POWER: Final[float] = 1.025
- DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 1.0
+ DEFAULT_HOLD_PENALTY_RATIO: Final[float] = 1.0
DEFAULT_HOLD_PENALTY_POWER: Final[float] = 1.025
DEFAULT_CHECK_INVARIANTS: Final[bool] = True
```
```python
and self._position == Positions.Neutral
):
max_idle_duration = max(1, self.max_idle_duration_candles)
- idle_penalty_scale = float(
+ idle_penalty_ratio = float(
model_reward_parameters.get(
- "idle_penalty_scale", ReforceXY.DEFAULT_IDLE_PENALTY_SCALE
+ "idle_penalty_ratio", ReforceXY.DEFAULT_IDLE_PENALTY_RATIO
)
)
idle_penalty_power = float(
model_reward_parameters.get(
"idle_penalty_power", ReforceXY.DEFAULT_IDLE_PENALTY_POWER
)
)
idle_duration_ratio = idle_duration / max(1, max_idle_duration)
base_reward = (
-idle_factor
- * idle_penalty_scale
+ * idle_penalty_ratio
* idle_duration_ratio**idle_penalty_power
)
self._last_idle_penalty = float(base_reward)
```
```python
and self._position in (Positions.Short, Positions.Long)
and action == Actions.Neutral.value
):
- hold_penalty_scale = float(
+ hold_penalty_ratio = float(
model_reward_parameters.get(
- "hold_penalty_scale", ReforceXY.DEFAULT_HOLD_PENALTY_SCALE
+ "hold_penalty_ratio", ReforceXY.DEFAULT_HOLD_PENALTY_RATIO
)
)
hold_penalty_power = float(
model_reward_parameters.get(
"hold_penalty_power", ReforceXY.DEFAULT_HOLD_PENALTY_POWER
)
)
else:
base_reward = (
-hold_factor
- * hold_penalty_scale
+ * hold_penalty_ratio
* (duration_ratio - 1.0) ** hold_penalty_power
)
self._last_hold_penalty = float(base_reward)
```
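For existing configurations that still use the old `*_scale` keys, a small migration helper may be handy (the flat `model_reward_parameters` dict layout is an assumption):

```python
# Old-to-new key mapping introduced by this rename.
RENAMES = {
    "idle_penalty_scale": "idle_penalty_ratio",
    "hold_penalty_scale": "hold_penalty_ratio",
}

def migrate(model_reward_parameters: dict) -> dict:
    """Return a copy with any legacy *_scale keys renamed to *_ratio."""
    return {RENAMES.get(k, k): v for k, v in model_reward_parameters.items()}

print(migrate({"idle_penalty_scale": 1.5, "base_factor": 100.0}))
# {'idle_penalty_ratio': 1.5, 'base_factor': 100.0}
```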