- **`--profit_aim`** (float, default: 0.03) – Profit target threshold (e.g. 0.03 = 3%).
-- **`--risk_reward_ratio`** (float, default: 1.0) – Risk-reward multiplier.
+- **`--risk_reward_ratio`** (float, default: 2.0) – Risk-reward multiplier.
- **`--action_masking`** (bool, default: true) – Simulate environment action
masking. Invalid actions are penalized only when masking is disabled (see the
sketch below).
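A minimal sketch of this behavior, under the assumption that masking filters
invalid actions upstream while the penalty path only fires with masking off
(`action_is_valid` and the return convention are hypothetical; the penalty
value mirrors `DEFAULT_INVALID_ACTION = -2.0` later in this diff):

```python
def invalid_action_reward(action_masking: bool, action_is_valid: bool,
                          penalty: float = -2.0) -> float | None:
    """Sketch: invalid actions are penalized only when masking is disabled."""
    if not action_masking and not action_is_valid:
        return penalty
    return None  # valid action, or already masked away by the environment
```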
| Parameter | Default | Description |
| ------------------- | ------- | ----------------------------- |
| `profit_aim` | 0.03 | Profit target threshold |
-| `risk_reward_ratio` | 1.0 | Risk/reward multiplier |
-| `win_reward_factor` | 2.0 | Profit overshoot bonus factor |
+| `risk_reward_ratio` | 2.0 | Risk/reward multiplier |
+| `win_reward_factor` | 2.0 | Profit target bonus factor |
| `pnl_factor_beta` | 0.5 | PnL amplification sensitivity |
**Note:** In ReforceXY, `risk_reward_ratio` maps to `rr`.
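For reference, `win_reward_factor` and `pnl_factor_beta` drive the PnL
amplification exercised by the tests later in this diff. A sketch of the
overshoot branch, assuming `pnl_target = profit_aim * risk_reward_ratio` as
the tests compute it (the function name is illustrative):

```python
import math

def pnl_amplification(pnl: float, pnl_target: float,
                      win_reward_factor: float = 2.0,
                      pnl_factor_beta: float = 0.5) -> float:
    """Amplify the exit reward once |pnl / pnl_target| exceeds 1 (sketch)."""
    pnl_ratio = pnl / pnl_target
    return 1.0 + win_reward_factor * math.tanh(pnl_factor_beta * (pnl_ratio - 1.0))
```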
#### Hold Potential Transforms
-| Parameter | Default | Description |
-| ----------------------------------- | ------- | -------------------- |
-| `hold_potential_ratio` | 0.25 | Hold potential ratio |
-| `hold_potential_gain` | 1.0 | Gain multiplier |
-| `hold_potential_transform_pnl` | tanh | PnL transform |
-| `hold_potential_transform_duration` | tanh | Duration transform |
+| Parameter | Default | Description |
+| ----------------------------------- | -------- | -------------------- |
+| `hold_potential_ratio` | 0.015625 | Hold potential ratio |
+| `hold_potential_gain` | 1.0 | Gain multiplier |
+| `hold_potential_transform_pnl` | tanh | PnL transform |
+| `hold_potential_transform_duration` | tanh | Duration transform |
**Hold Potential Formula:**
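The formula itself is not reproduced in this excerpt. A sketch consistent with
the parameters above and with the scale computation later in the diff
(`scale = ratio * base_factor`); how the two transformed terms combine is an
assumption:

```python
import math

# Hypothetical: Φ for an open position, with tanh as the default transform
# for both the PnL term and the duration term.
def hold_potential(pnl_ratio: float, duration_ratio: float, base_factor: float,
                   ratio: float = 0.015625, gain: float = 1.0) -> float:
    return (ratio * base_factor
            * math.tanh(gain * pnl_ratio)
            * math.tanh(gain * duration_ratio))
```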
# Default discount factor γ for potential-based reward shaping
POTENTIAL_GAMMA_DEFAULT: float = 0.95
+# Default risk/reward ratio (RR)
+RISK_REWARD_RATIO_DEFAULT: float = 2.0
+
# Supported attenuation modes
ATTENUATION_MODES: Tuple[str, ...] = ("sqrt", "linear", "power", "half_life")
ATTENUATION_MODES_WITH_LEGACY: Tuple[str, ...] = ("legacy",) + ATTENUATION_MODES
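# Illustration (sketch): POTENTIAL_GAMMA_DEFAULT feeds the canonical
# potential-based shaping term, computed later in this diff as
# `reward_shaping = gamma * next_potential - prev_potential`.
def _pbrs_term_sketch(prev_potential: float, next_potential: float,
                      gamma: float = POTENTIAL_GAMMA_DEFAULT) -> float:
    # F(s, s') = gamma * Φ(s') - Φ(s); the telescoping sum over an episode
    # preserves the optimal policy (Ng, Harada & Russell, 1999).
    return gamma * next_potential - prev_potential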
"exit_potential_decay": 0.5,
# Hold potential (PBRS function Φ)
"hold_potential_enabled": True,
- "hold_potential_ratio": 0.25,
+ "hold_potential_ratio": 0.015625,
"hold_potential_gain": 1.0,
"hold_potential_transform_pnl": "tanh",
"hold_potential_transform_duration": "tanh",
for bkey in _bool_keys:
if bkey in sanitized:
original_val = sanitized[bkey]
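# Assumption: `_to_bool` returns its input unchanged when it is already a
# bool, so the identity check below records an adjustment only when an
# actual coercion took place.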
- coerced = _to_bool(original_val)
- if coerced is not original_val:
- sanitized[bkey] = coerced
+ coerced_val = _to_bool(original_val)
+ if coerced_val is not original_val:
+ sanitized[bkey] = coerced_val
adjustments.setdefault(
bkey,
{
"original": original_val,
- "adjusted": coerced,
+ "adjusted": coerced_val,
"reason": "bool_coerce",
"validation_mode": "strict" if strict else "relaxed",
},
original_val = sanitized[key]
# Robust coercion to float using helper (handles None/str/bool/non-finite)
- coerced = _get_float_param({key: original_val}, key, np.nan)
+ coerced_val = _get_float_param({key: original_val}, key, np.nan)
# Handle non-numeric or unparsable values
- if not np.isfinite(coerced):
+ if not np.isfinite(coerced_val):
# Treat derived parameters specially: drop to allow downstream derivation
if key == "max_idle_duration_candles":
# Remove the key so downstream helpers derive from max_trade_duration_candles
}
continue
- original_numeric = float(coerced)
+ original_numeric = float(coerced_val)
# Track type coercion
if not isinstance(original_val, (int, float)):
if pnl_target > 0.0:
win_reward_factor = _get_float_param(params, "win_reward_factor")
pnl_factor_beta = _get_float_param(params, "pnl_factor_beta")
- rr = risk_reward_ratio if risk_reward_ratio > 0 else 1.0
+ rr = risk_reward_ratio if risk_reward_ratio > 0 else RISK_REWARD_RATIO_DEFAULT
pnl_ratio = pnl / pnl_target
if abs(pnl_ratio) > 1.0:
pnl_ratio = float(pnl / pnl_target)
duration_ratio = float(np.clip(duration_ratio, 0.0, 1.0))
- ratio = _get_float_param(params, scale_key, 0.25 if "hold" in scale_key else 0.125)
+ ratio = _get_float_param(params, scale_key)
scale = ratio * base_factor
- gain = _get_float_param(params, gain_key, 1.0)
- transform_pnl = _get_str_param(params, transform_pnl_key, "tanh")
- transform_duration = _get_str_param(params, transform_dur_key, "tanh")
+ gain = _get_float_param(params, gain_key)
+ transform_pnl = _get_str_param(params, transform_pnl_key)
+ transform_duration = _get_str_param(params, transform_dur_key)
duration_multiplier = 1.0
if risk_reward_ratio is not None:
parser.add_argument(
"--risk_reward_ratio",
type=float,
- default=1.0,
- help="Risk reward ratio multiplier (default: 1.0).",
+ default=RISK_REWARD_RATIO_DEFAULT,
+ help=f"Risk reward ratio multiplier (default: {RISK_REWARD_RATIO_DEFAULT}).",
)
parser.add_argument(
"--max_duration_ratio",
config = RewardScenarioConfig(
base_factor=PARAMS.BASE_FACTOR,
profit_aim=PARAMS.PROFIT_AIM,
- risk_reward_ratio=1.0,
+ risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
tolerance_relaxed=TOLERANCE.IDENTITY_RELAXED,
)
assert_reward_calculation_scenarios(
win_reward_factor = 3.0
beta = 0.5
profit_aim = PARAMS.PROFIT_AIM
+ risk_reward_ratio = PARAMS.RISK_REWARD_RATIO
+ pnl_target = profit_aim * risk_reward_ratio
params = self.base_params(
win_reward_factor=win_reward_factor,
pnl_factor_beta=beta,
exit_linear_slope=0.0,
)
params.pop("base_factor", None)
- pnl_values = [profit_aim * m for m in (1.05, PARAMS.RISK_REWARD_RATIO_HIGH, 5.0, 10.0)]
+ pnl_values = [pnl_target * m for m in (1.05, 2.0, 5.0, 10.0)]
ratios_observed: list[float] = []
for pnl in pnl_values:
context = self.make_ctx(
action=Actions.Long_exit,
)
br = calculate_reward_with_defaults(
- context, params, base_factor=1.0, profit_aim=profit_aim
+ context,
+ params,
+ base_factor=1.0,
+ profit_aim=profit_aim,
+ risk_reward_ratio=risk_reward_ratio,
)
ratio = br.exit_component / pnl if pnl != 0 else 0.0
ratios_observed.append(float(ratio))
)
expected_ratios: list[float] = []
for pnl in pnl_values:
- pnl_ratio = pnl / profit_aim
+ pnl_ratio = pnl / pnl_target
expected = 1.0 + win_reward_factor * math.tanh(beta * (pnl_ratio - 1.0))
expected_ratios.append(expected)
for obs, exp in zip(ratios_observed, expected_ratios):
"""
base_factor = PARAMS.BASE_FACTOR
profit_aim = PARAMS.PROFIT_AIM
- risk_reward_ratio = 1.0
+ risk_reward_ratio = PARAMS.RISK_REWARD_RATIO
max_trade_duration_candles = PARAMS.TRADE_DURATION_MEDIUM
params = self.base_params(
)
params_rr.pop("risk_reward_ratio", None)
- br_ratio = calculate_reward_with_defaults(context, params_ratio, risk_reward_ratio=1.0)
- br_rr = calculate_reward_with_defaults(context, params_rr, risk_reward_ratio=1.0)
+ br_ratio = calculate_reward_with_defaults(
+ context, params_ratio, risk_reward_ratio=PARAMS.RISK_REWARD_RATIO
+ )
+ br_rr = calculate_reward_with_defaults(
+ context, params_rr, risk_reward_ratio=PARAMS.RISK_REWARD_RATIO
+ )
self.assertAlmostEqualFloat(
br_rr.total,
Attributes:
BASE_FACTOR: Default base factor for reward scaling (90.0)
PROFIT_AIM: Target profit threshold (0.06)
- RISK_REWARD_RATIO: Standard risk/reward ratio (1.0)
- RISK_REWARD_RATIO_HIGH: High risk/reward ratio for stress tests (2.0)
+ RISK_REWARD_RATIO: Standard risk/reward ratio (2.0)
+ RISK_REWARD_RATIO_HIGH: High risk/reward ratio for stress tests (4.0)
PNL_STD: Standard deviation for PnL generation (0.02)
PNL_DUR_VOL_SCALE: Duration-based volatility scaling factor (0.001)
MAX_TRADE_DURATION_HETEROSCEDASTICITY: Max trade duration used for heteroscedasticity tests (10)
# Common additive parameters
- ADDITIVE_RATIO_DEFAULT: Default additive ratio (0.4)
+ ADDITIVE_RATIO_DEFAULT: Default additive ratio (0.125)
ADDITIVE_GAIN_DEFAULT: Default additive gain (1.0)
+
+ # PBRS hold potential parameters
+ HOLD_POTENTIAL_RATIO_DEFAULT: Default hold potential ratio (0.015625)
"""
BASE_FACTOR: float = 90.0
PROFIT_AIM: float = 0.06
- RISK_REWARD_RATIO: float = 1.0
- RISK_REWARD_RATIO_HIGH: float = 2.0
+ RISK_REWARD_RATIO: float = 2.0
+ RISK_REWARD_RATIO_HIGH: float = 4.0
PNL_STD: float = 0.02
PNL_DUR_VOL_SCALE: float = 0.001
MAX_TRADE_DURATION_HETEROSCEDASTICITY: int = 10
# Additive parameters
- ADDITIVE_RATIO_DEFAULT: float = 0.4
+ ADDITIVE_RATIO_DEFAULT: float = 0.125
ADDITIVE_GAIN_DEFAULT: float = 1.0
+ # PBRS hold potential parameters
+ HOLD_POTENTIAL_RATIO_DEFAULT: float = 0.015625
+
@dataclass(frozen=True)
class TestScenarios:
attenuation_modes: Sequence[str],
base_params_fn,
tolerance_relaxed: float,
- risk_reward_ratio: float = 1.0,
+ risk_reward_ratio: float = PARAMS.RISK_REWARD_RATIO,
):
"""Validate exit factor attenuation across multiple modes.
duration_ratio=case["duration_ratio"],
context=case["context"],
params=case["params"],
- risk_reward_ratio=2.0,
+ risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
)
exp = case.get("expectation")
if exp == "safe_zero":
self, _get_exit_factor, 90.0, 0.08, 0.03, 0.5, test_context,
bad_params={"exit_attenuation_mode": "power", "exit_power_tau": -1.0},
reference_params={"exit_attenuation_mode": "linear"},
- risk_reward_ratio=1.0
+ risk_reward_ratio=PARAMS.RISK_REWARD_RATIO
)
"""
_get_bool_param,
)
+from ..constants import PARAMS
from ..test_base import make_ctx
from . import calculate_reward_with_defaults
params,
base_factor=100.0,
profit_aim=0.05,
- risk_reward_ratio=1.0,
+ risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
prev_potential=np.nan,
)
assert math.isfinite(breakdown.prev_potential)
_LOG_2: Final[float] = math.log(2.0)
+ DEFAULT_BASE_FACTOR: Final[float] = 100.0
+
DEFAULT_MAX_TRADE_DURATION_CANDLES: Final[int] = 128
DEFAULT_IDLE_DURATION_MULTIPLIER: Final[int] = 4
- DEFAULT_BASE_FACTOR: Final[float] = 100.0
- DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
-
DEFAULT_EXIT_POTENTIAL_DECAY: Final[float] = 0.5
DEFAULT_ENTRY_ADDITIVE_ENABLED: Final[bool] = False
DEFAULT_ENTRY_ADDITIVE_RATIO: Final[float] = 0.125
DEFAULT_ENTRY_ADDITIVE_GAIN: Final[float] = 1.0
DEFAULT_HOLD_POTENTIAL_ENABLED: Final[bool] = True
- DEFAULT_HOLD_POTENTIAL_RATIO: Final[float] = 0.25
+ DEFAULT_HOLD_POTENTIAL_RATIO: Final[float] = 0.015625
DEFAULT_HOLD_POTENTIAL_GAIN: Final[float] = 1.0
DEFAULT_EXIT_ADDITIVE_ENABLED: Final[bool] = False
DEFAULT_EXIT_ADDITIVE_RATIO: Final[float] = 0.125
DEFAULT_PNL_FACTOR_BETA: Final[float] = 0.5
DEFAULT_WIN_REWARD_FACTOR: Final[float] = 2.0
+ DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
DEFAULT_INVALID_ACTION: Final[float] = -2.0
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+
self.pairs: List[str] = self.config.get("exchange", {}).get("pair_whitelist")
if not self.pairs:
raise ValueError(
reward_shaping = gamma * next_potential - prev_potential
if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
- duration_ratio = trade_duration / max(max_trade_duration, 1)
+ duration_ratio = trade_duration / max(1, max_trade_duration)
exit_additive = self._compute_exit_additive(
pnl, pnl_target, duration_ratio, exit_additive_scale
)
def _compute_gradient_steps(tf: int, ss: int) -> int:
if tf > 0 and ss > 0:
- return min(tf, max(math.ceil(tf / ss), 1))
+ return min(tf, max(1, math.ceil(tf / ss)))
return -1
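# Example: tf=512, ss=64 -> ceil(512 / 64) = 8 -> min(512, 8) = 8 gradient steps.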