- Maximum trade duration in candles (from environment config)
- Should match your actual trading environment setting
-- Drives idle grace: when `max_idle_duration_candles` ≤ 0 the fallback = `2 * max_trade_duration`
+- Drives idle grace: when `max_idle_duration_candles` ≤ 0, fallback = `2 * max_trade_duration`
### Reward Configuration
import warnings
from enum import Enum, IntEnum
from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, Mapping
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
def _get_param_float(
- params: Mapping[str, RewardParamValue], key: str, default: RewardParamValue
+ params: RewardParams, key: str, default: RewardParamValue
) -> float:
"""Extract float parameter with type safety and default fallback."""
value = params.get(key, default)
# Idle penalty (env defaults)
"idle_penalty_scale": 0.5,
"idle_penalty_power": 1.025,
- # Fallback semantics: 2 * max_trade_duration_candles
+ # Fallback: 2 * max_trade_duration_candles
"max_idle_duration_candles": None,
# Holding keys (env defaults)
"holding_penalty_scale": 0.25,
if key == "exit_attenuation_mode":
parser.add_argument(
f"--{key}",
- type=str, # case preserved; validation + silent fallback occurs before factor computation
+ type=str,
choices=sorted(ALLOWED_EXIT_MODES),
default=None,
help=help_text,
rng = random.Random(seed)
short_allowed = _is_short_allowed(trading_mode)
action_masking = _to_bool(params.get("action_masking", True))
- samples: list[dict[str, float]] = []
+ samples: list[Dict[str, float]] = []
for _ in range(num_samples):
if short_allowed:
position_choices = [Positions.Neutral, Positions.Long, Positions.Short]
position=Positions.Neutral,
action=Actions.Neutral,
),
- # Holding penalty (maintained position)
+ # Holding penalty
RewardContext(
pnl=0.0,
trade_duration=80,
"trade_duration": np.random.uniform(5, 150, 300),
"idle_duration": idle_duration,
"position": np.random.choice([0.0, 0.5, 1.0], 300),
- "is_force_exit": np.random.choice([0.0, 1.0], 300, p=[0.85, 0.15]),
}
)
"trade_duration": np.random.uniform(5, 150, 300),
"idle_duration": np.random.uniform(0, 100, 300),
"position": np.random.choice([0.0, 0.5, 1.0], 300),
- "is_force_exit": np.random.choice([0.0, 1.0], 300, p=[0.8, 0.2]),
}
)
[np.random.uniform(5, 50, 50), np.zeros(150)]
),
"position": np.random.choice([0.0, 0.5, 1.0], 200),
- "is_force_exit": np.random.choice([0.0, 1.0], 200, p=[0.8, 0.2]),
}
)
"""
_LOG_2 = math.log(2.0)
- _action_masks_cache: Dict[Tuple[bool, int], NDArray[np.bool_]] = {}
+ _action_masks_cache: Dict[Tuple[bool, float], NDArray[np.bool_]] = {}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
cache_key = (
can_short,
- position.value,
+ float(position.value),
)
if cache_key in ReforceXY._action_masks_cache:
return ReforceXY._action_masks_cache[cache_key]
delta_pnl = pnl - pre_pnl
info = {
"tick": self._current_tick,
- "position": self._position.value,
+ "position": float(self._position.value),
"action": action,
"pre_pnl": round(pre_pnl, 5),
"pnl": round(pnl, 5),