| freqai.optuna_hyperopt.enabled | true | bool | Enables HPO. |
| freqai.optuna_hyperopt.n_jobs | CPU threads / 4 | int >= 1 | Parallel HPO workers. |
| freqai.optuna_hyperopt.storage | `file` | enum {`file`,`sqlite`} | HPO storage backend. |
-| freqai.optuna_hyperopt.continuous | true | bool | Continuous HPO. |
-| freqai.optuna_hyperopt.warm_start | true | bool | Warm start HPO with previous best value(s). |
+| freqai.optuna_hyperopt.continuous | false | bool | Continuous HPO. |
+| freqai.optuna_hyperopt.warm_start | false | bool | Warm start HPO with previous best value(s). |
| freqai.optuna_hyperopt.n_startup_trials | 15 | int >= 0 | HPO startup trials. |
| freqai.optuna_hyperopt.n_trials | 50 | int >= 1 | Maximum HPO trials. |
| freqai.optuna_hyperopt.timeout | 7200 | int >= 0 | HPO wall-clock timeout in seconds. |
## Prerequisites
-Requirements: Python 3.8+, ≥4GB RAM (CPU only). Recommended venv:
+Requirements: Python 3.9+, ≥4GB RAM (CPU only). Recommended venv:
```shell
cd ReforceXY/reward_space_analysis
**`--seed`** (int, default: 42) – Master seed (reuse for identical runs).
-**`--max_trade_duration`** (int, default: 128) – Max trade duration (candles). Idle grace fallback: `max_idle_duration_candles = 4 * max_trade_duration`.
-
### Reward Configuration
**`--base_factor`** (float, default: 100.0) – Base reward scale (match environment).
| `win_reward_factor` | 2.0 | Profit overshoot multiplier |
| `pnl_factor_beta` | 0.5 | PnL amplification beta |
| `idle_penalty_scale` | 0.5 | Idle penalty scale |
-| `idle_penalty_power` | 1.025 | Idle penalty exponent (>1 slightly convex) |
+| `idle_penalty_power` | 1.025 | Idle penalty exponent |
+| `max_trade_duration_candles` | 128 | Trade duration cap |
| `max_idle_duration_candles` | None | Idle duration cap; fallback 4× max trade duration |
| `hold_penalty_scale` | 0.25 | Hold penalty scale |
| `hold_penalty_power` | 1.025 | Hold penalty exponent |
| `generated_at` | string (ISO 8601) | Timestamp of generation (not part of hash). |
| `num_samples` | int | Number of synthetic samples generated. |
| `seed` | int | Master random seed driving simulation determinism. |
-| `max_trade_duration` | int | Max trade duration used to scale durations. |
| `profit_target_effective` | float | Profit target after risk/reward scaling. |
| `pvalue_adjust_method` | string | Multiple testing correction mode (`none` or `benjamini_hochberg`). |
| `parameter_adjustments` | object | Map of any automatic bound clamps (empty if none). |
| `reward_params` | object | Full resolved reward parameter set (post-validation). |
| `simulation_params` | object | All simulation inputs (num_samples, seed, volatility knobs, etc.). |
-| `params_hash` | string (sha256) | Hash over ALL `simulation_params` + ALL `reward_params` (lexicographically ordered). |
+| `params_hash` | string (sha256) | Hash over ALL `simulation_params` (excluding `out_dir`, `real_episodes`) + ALL `reward_params` (lexicographically ordered). |
Two runs match iff `params_hash` identical (defaults included in hash scope).
### Memory Errors
Reduce samples; ensure 64‑bit Python; batch processing; add RAM/swap.
-
[tool.pytest.ini_options]
-# Pytest configuration following GitHub Copilot instructions:
-# - Single source of truth for test configuration
-# - Reproducible test execution
-# - Clear error reporting
-
minversion = "6.0"
testpaths = [
"."
"--color=yes"
]
-# Test markers
-markers = [
- "integration: Integration tests requiring external dependencies",
- "unit: Fast unit tests",
- "statistical: Statistical validation tests",
- "slow: Tests that take more than a few seconds"
-]
-
# Minimum Python version
-python_version = "3.8"
+python_version = "3.9"
+
+[tool.ruff]
+line-length = 100
+target-version = "py39"
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I"]
+ignore = ["E501"]
import argparse
import dataclasses
+import hashlib
+import json
import math
+import numbers
import pickle
import random
import warnings
import pandas as pd
from scipy import stats
from scipy.spatial.distance import jensenshannon
-from scipy.stats import entropy
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.inspection import partial_dependence, permutation_importance
-from sklearn.metrics import r2_score
-from sklearn.model_selection import train_test_split
+from scipy.stats import entropy, probplot
+
+try:
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.inspection import partial_dependence, permutation_importance
+ from sklearn.metrics import r2_score
+ from sklearn.model_selection import train_test_split
+except Exception:
+ RandomForestRegressor = None
+ partial_dependence = None
+ permutation_importance = None
+ r2_score = None
+ train_test_split = None
class Actions(IntEnum):
"distribution_constant_fallback_qq_r2": 1.0,
"moment_extreme_threshold": 1e4,
"bootstrap_min_recommended": 200,
+ "sim_pnl_conservation_tol": 1e-10,
+ "sim_zero_pnl_epsilon": 1e-12,
+ "sim_zero_reward_epsilon": 1e-12,
+ "sim_extreme_pnl_threshold": 0.2,
}
# PBRS constants
# Idle penalty (env defaults)
"idle_penalty_scale": 0.5,
"idle_penalty_power": 1.025,
- # Fallback: 2 * max_trade_duration_candles
+ "max_trade_duration_candles": 128,
+ # Fallback: DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
"max_idle_duration_candles": None,
# Hold penalty (env defaults)
"hold_penalty_scale": 0.25,
"base_factor": "Base reward scale",
"idle_penalty_power": "Idle penalty exponent",
"idle_penalty_scale": "Idle penalty scale",
+ "max_trade_duration_candles": "Trade duration cap (candles)",
"max_idle_duration_candles": "Idle duration cap (candles)",
"hold_penalty_scale": "Hold penalty scale",
"hold_penalty_power": "Hold penalty exponent",
"base_factor": {"min": 0.0},
"idle_penalty_power": {"min": 0.0},
"idle_penalty_scale": {"min": 0.0},
+ "max_trade_duration_candles": {"min": 1.0},
"max_idle_duration_candles": {"min": 0.0},
"hold_penalty_scale": {"min": 0.0},
"hold_penalty_power": {"min": 0.0},
return True
if text in {"false", "0", "no", "n", "off"}:
return False
- return bool(text)
+ # Unsupported type
+ raise ValueError(f"Unrecognized boolean literal: {value!r}")
def _get_bool_param(params: RewardParams, key: str, default: bool) -> bool:
return bool(default)
-def _get_float_param(
- params: RewardParams, key: str, default: RewardParamValue
-) -> float:
+def _is_strict_validation(params: RewardParams) -> bool:
+ """Return strict validation flag from params (default True)."""
+ return _get_bool_param(params, "strict_validation", True)
+
+
+def _get_float_param(params: RewardParams, key: str, default: RewardParamValue) -> float:
"""Extract float parameter with type safety and default fallback."""
value = params.get(key, default)
# None -> NaN
return default
-def _compute_duration_ratio(trade_duration: int, max_trade_duration: int) -> float:
+def _compute_duration_ratio(trade_duration: int, max_trade_duration_candles: int) -> float:
"""Compute duration ratio with safe division."""
- return trade_duration / max(1, max_trade_duration)
+ return trade_duration / max(1, max_trade_duration_candles)
def _is_short_allowed(trading_mode: str) -> bool:
return 0.0
+def get_max_idle_duration_candles(
+ params: RewardParams,
+ *,
+ max_trade_duration_candles: Optional[int] = None,
+) -> int:
+ mtd = (
+ int(max_trade_duration_candles)
+ if isinstance(max_trade_duration_candles, (int, float))
+ else None
+ )
+ if mtd is None or mtd <= 0:
+ mtd = _get_int_param(
+ params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
+ )
+ if mtd <= 0:
+ mtd = int(DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128))
+
+ default_mid = int(DEFAULT_IDLE_DURATION_MULTIPLIER * int(mtd))
+ mid = _get_int_param(params, "max_idle_duration_candles", default_mid)
+ if mid <= 0:
+ mid = default_mid
+ return int(mid)
+
+
def validate_reward_parameters(
params: RewardParams,
+ strict: bool = True,
) -> Tuple[RewardParams, Dict[str, Dict[str, Any]]]:
- """Clamp parameters to bounds and coerce booleans.
+ """Clamp parameters to bounds and coerce booleans and numeric overrides.
- Returns sanitized copy plus adjustments mapping (param -> original/adjusted/reason).
- Non‑finite numerics fall back to min bound or 0.0.
+ Returns a sanitized copy plus adjustments mapping (param -> original/adjusted/reason).
+ Behavior:
+ - Boolean-like keys are coerced to bool.
+ - Numeric-bounded keys are coerced to float when provided as str/bool/None.
+ * In strict mode: raise on non-numeric or out-of-bounds.
+ * In relaxed mode: fallback to min bound or 0.0 with adjustment reason.
+ - Non‑finite numerics fall back to min bound or 0.0 (relaxed) or raise (strict).
"""
sanitized = dict(params)
adjustments: Dict[str, Dict[str, Any]] = {}
+
# Normalize boolean-like parameters explicitly to avoid inconsistent types
_bool_keys = [
"check_invariants",
coerced = _to_bool(original_val)
if coerced is not original_val:
sanitized[bkey] = coerced
- adjustments.setdefault(
- bkey,
- {
- "original": original_val,
- "adjusted": coerced,
- "reason": "bool_coerce",
- },
- )
+ adjustments.setdefault(
+ bkey,
+ {
+ "original": original_val,
+ "adjusted": coerced,
+ "reason": "bool_coerce",
+ "validation_mode": "strict" if strict else "relaxed",
+ },
+ )
+
+ # Coerce and clamp numeric-bounded parameters
for key, bounds in _PARAMETER_BOUNDS.items():
if key not in sanitized:
continue
- value = sanitized[key]
- if not isinstance(value, (int, float)):
+
+ original_val = sanitized[key]
+ # Robust coercion to float using helper (handles None/str/bool/non-finite)
+ coerced = _get_float_param({key: original_val}, key, np.nan)
+
+ # Handle non-numeric or unparsable values
+ if not np.isfinite(coerced):
+ # Treat derived parameters specially: drop to allow downstream derivation
+ if key == "max_idle_duration_candles":
+ # Remove the key so downstream helpers derive from max_trade_duration_candles
+ del sanitized[key]
+ adjustments[key] = {
+ "original": original_val,
+ "adjusted": None,
+ "reason": "derived_default",
+ "validation_mode": "strict" if strict else "relaxed",
+ }
+ continue
+ if strict:
+ raise ValueError(f"Parameter '{key}' is non-numeric or invalid: {original_val!r}")
+ adjusted = bounds.get("min", 0.0)
+ sanitized[key] = adjusted
+ adjustments[key] = {
+ "original": original_val,
+ "adjusted": adjusted,
+ "reason": "non_numeric_reset",
+ "validation_mode": "strict" if strict else "relaxed",
+ }
continue
- original = float(value)
- adjusted = original
+
+ original_numeric = float(coerced)
+ adjusted = original_numeric
reason_parts: List[str] = []
+
+ # Record numeric coercion if type changed (e.g., from str/bool/None)
+ if not isinstance(original_val, (int, float)):
+ adjustments.setdefault(
+ key,
+ {
+ "original": original_val,
+ "adjusted": original_numeric,
+ "reason": "numeric_coerce",
+ "validation_mode": "strict" if strict else "relaxed",
+ },
+ )
+ # Update sanitized to numeric before clamping
+ sanitized[key] = original_numeric
+
+ # Bounds enforcement
if "min" in bounds and adjusted < bounds["min"]:
+ if strict:
+ raise ValueError(f"Parameter '{key}'={adjusted} below min {bounds['min']}")
adjusted = bounds["min"]
reason_parts.append(f"min={bounds['min']}")
if "max" in bounds and adjusted > bounds["max"]:
+ if strict:
+ raise ValueError(f"Parameter '{key}'={adjusted} above max {bounds['max']}")
adjusted = bounds["max"]
reason_parts.append(f"max={bounds['max']}")
+
if not np.isfinite(adjusted):
+ if strict:
+ raise ValueError(f"Parameter '{key}' is non-finite: {adjusted}")
adjusted = bounds.get("min", 0.0)
reason_parts.append("non_finite_reset")
- if not np.isclose(adjusted, original):
+
+ if not np.isclose(adjusted, original_numeric):
sanitized[key] = adjusted
+ prev_reason = adjustments.get(key, {}).get("reason")
+ reason: List[str] = []
+ if prev_reason:
+ reason.append(prev_reason)
+ reason.extend(reason_parts)
+ reason_str = ",".join(reason) if reason else "clamp"
adjustments[key] = {
- "original": original,
+ "original": original_val,
"adjusted": adjusted,
- "reason": ",".join(reason_parts), # textual reason directly
+ "reason": reason_str,
+ "validation_mode": "strict" if strict else "relaxed",
}
+
return sanitized, adjustments
- For exit_attenuation_mode, enforce allowed choices (case-sensitive).
- Skip keys already managed as top-level options (e.g., base_factor) to avoid duplicates.
"""
- skip_keys = {"base_factor"} # already defined as top-level
+ skip_keys = {"base_factor"}
for key, default in DEFAULT_MODEL_REWARD_PARAMETERS.items():
if key in skip_keys:
continue
- help_text = DEFAULT_MODEL_REWARD_PARAMETERS_HELP.get(
- key, f"Override tunable '{key}'."
- )
+ help_text = DEFAULT_MODEL_REWARD_PARAMETERS_HELP.get(key, f"Override tunable '{key}'.")
if key == "exit_attenuation_mode":
parser.add_argument(
f"--{key}",
else:
# Map numerics to float; leave strings as str
if isinstance(default, (int, float)):
- parser.add_argument(
- f"--{key}", type=float, default=None, help=help_text
- )
+ parser.add_argument(f"--{key}", type=float, default=None, help=help_text)
else:
parser.add_argument(f"--{key}", type=str, default=None, help=help_text)
@dataclasses.dataclass
class RewardContext:
+ """Context for reward computation."""
+
pnl: float
trade_duration: int
idle_duration: int
- max_trade_duration: int
max_unrealized_profit: float
min_unrealized_profit: float
position: Positions
hold_penalty: float = 0.0
exit_component: float = 0.0
# PBRS components
- shaping_reward: float = 0.0
+ reward_shaping: float = 0.0
entry_additive: float = 0.0
exit_additive: float = 0.0
- current_potential: float = 0.0
+ prev_potential: float = 0.0
next_potential: float = 0.0
duration_ratio: float,
params: RewardParams,
) -> float:
- """Exit attenuation factor (kernel + optional plateau) * pnl_factor with invariants."""
+ """Exit factor (kernel + optional plateau) * pnl_factor with invariants."""
# Basic finiteness checks
- if (
- not np.isfinite(base_factor)
- or not np.isfinite(pnl)
- or not np.isfinite(duration_ratio)
- ):
+ if not np.isfinite(base_factor) or not np.isfinite(pnl) or not np.isfinite(duration_ratio):
return _fail_safely("non_finite_exit_factor_inputs")
# Guard: duration ratio should never be negative
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_plateau_grace", 1.0),
)
if exit_plateau_grace < 0.0:
- exit_plateau_grace = 1.0
+ warnings.warn(
+ "exit_plateau_grace < 0; falling back to 0.0",
+ RewardDiagnosticsWarning,
+ stacklevel=2,
+ )
+ exit_plateau_grace = 0.0
exit_linear_slope = _get_float_param(
params,
"exit_linear_slope",
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_linear_slope", 1.0),
)
if exit_linear_slope < 0.0:
- exit_linear_slope = 1.0
+ warnings.warn(
+ "exit_linear_slope < 0; falling back to 0.0",
+ RewardDiagnosticsWarning,
+ stacklevel=2,
+ )
+ exit_linear_slope = 0.0
def _legacy_kernel(f: float, dr: float) -> float:
return f * (1.5 if dr <= 1.0 else 0.5)
if 0.0 < tau <= 1.0:
alpha = -math.log(tau) / _LOG_2
else:
+ if _is_strict_validation(params):
+ raise ValueError(f"exit_power_tau={tau} must be in (0,1] in strict mode")
+ warnings.warn(
+ f"exit_power_tau={tau} invalid; falling back to alpha=1.0",
+ RewardDiagnosticsWarning,
+ stacklevel=2,
+ )
alpha = 1.0
return f / math.pow(1.0 + dr, alpha)
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_half_life", 0.5),
)
if hl <= 0.0:
- hl = 0.5
+ if _is_strict_validation(params):
+ raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
+ warnings.warn(
+ f"exit_half_life={hl} <= 0; falling back to 0.0",
+ RewardDiagnosticsWarning,
+ stacklevel=2,
+ )
+ hl = 0.0
return f * math.pow(2.0, -dr / hl)
kernels = {
f"Unknown exit_attenuation_mode '{exit_attenuation_mode}'; defaulting to 'linear' "
f"(effective_dr={effective_dr:.5f})"
),
- RuntimeWarning,
+ RewardDiagnosticsWarning,
stacklevel=2,
)
kernel = _linear_kernel
try:
- base_factor = kernel(base_factor, effective_dr)
+ attenuation_factor = kernel(base_factor, effective_dr)
except Exception as e:
warnings.warn(
f"exit_attenuation_mode '{exit_attenuation_mode}' failed ({e!r}); fallback linear (effective_dr={effective_dr:.5f})",
- RuntimeWarning,
+ RewardDiagnosticsWarning,
stacklevel=2,
)
- base_factor = _linear_kernel(base_factor, effective_dr)
+ attenuation_factor = _linear_kernel(base_factor, effective_dr)
- # Apply pnl_factor after time attenuation
- base_factor *= pnl_factor
+ exit_factor = attenuation_factor * pnl_factor
- # Invariant & safety checks
if _get_bool_param(
params,
"check_invariants",
bool(DEFAULT_MODEL_REWARD_PARAMETERS.get("check_invariants", True)),
):
- if not np.isfinite(base_factor):
+ if not np.isfinite(exit_factor):
return _fail_safely("non_finite_exit_factor_after_kernel")
- if base_factor < 0.0 and pnl >= 0.0:
- # Clamp: avoid negative amplification on non-negative pnl
- base_factor = 0.0
+ if exit_factor < 0.0 and pnl >= 0.0:
+ exit_factor = 0.0
exit_factor_threshold = _get_float_param(
params,
"exit_factor_threshold",
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_factor_threshold", 10000.0),
)
if exit_factor_threshold > 0 and np.isfinite(exit_factor_threshold):
- if abs(base_factor) > exit_factor_threshold:
+ if abs(exit_factor) > exit_factor_threshold:
warnings.warn(
(
- f"_get_exit_factor |factor|={abs(base_factor):.2f} exceeds threshold {exit_factor_threshold:.2f}"
+ f"_get_exit_factor |factor|={abs(exit_factor):.2f} exceeds threshold {exit_factor_threshold:.2f}"
),
- RuntimeWarning,
+ RewardDiagnosticsWarning,
stacklevel=2,
)
- return base_factor
+ return exit_factor
def _get_pnl_factor(
) -> float:
"""PnL factor: tanh overshoot/loss modulation + efficiency tilt (non-negative)."""
pnl = context.pnl
- if (
- not np.isfinite(pnl)
- or not np.isfinite(profit_target)
- or not np.isfinite(risk_reward_ratio)
- ):
+ if not np.isfinite(pnl) or not np.isfinite(profit_target) or not np.isfinite(risk_reward_ratio):
return _fail_safely("non_finite_inputs_pnl_factor")
if profit_target <= 0.0:
return 0.0
if np.isfinite(range_pnl) and not np.isclose(range_pnl, 0.0):
efficiency_ratio = (pnl - min_pnl) / range_pnl
if pnl > 0.0:
- efficiency_factor = 1.0 + efficiency_weight * (
- efficiency_ratio - efficiency_center
- )
+ efficiency_factor = 1.0 + efficiency_weight * (efficiency_ratio - efficiency_center)
elif pnl < 0.0:
- efficiency_factor = 1.0 + efficiency_weight * (
- efficiency_center - efficiency_ratio
- )
+ efficiency_factor = 1.0 + efficiency_weight * (efficiency_center - efficiency_ratio)
return max(0.0, pnl_target_factor * efficiency_factor)
return False
-def _idle_penalty(
- context: RewardContext, idle_factor: float, params: RewardParams
-) -> float:
+def _idle_penalty(context: RewardContext, idle_factor: float, params: RewardParams) -> float:
"""Mirror the environment's idle penalty behavior."""
idle_penalty_scale = _get_float_param(
params,
"idle_penalty_power",
DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_power", 1.025),
)
- max_trade_duration_candles = _get_int_param(
- params, "max_trade_duration_candles", context.max_trade_duration
- )
- if max_trade_duration_candles <= 0:
- max_trade_duration_candles = int(context.max_trade_duration)
-
- max_idle_duration_candles = _get_int_param(
- params,
- "max_idle_duration_candles",
- DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles,
- )
- if max_idle_duration_candles <= 0:
- max_idle_duration_candles = (
- DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
- )
- max_idle_duration = max_idle_duration_candles
-
- idle_duration_ratio = context.idle_duration / max(1, max_idle_duration)
+ max_idle_duration_candles = get_max_idle_duration_candles(params)
+ idle_duration_ratio = context.idle_duration / max(1, max_idle_duration_candles)
return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
-def _hold_penalty(
- context: RewardContext, hold_factor: float, params: RewardParams
-) -> float:
+def _hold_penalty(context: RewardContext, hold_factor: float, params: RewardParams) -> float:
"""Mirror the environment's hold penalty behavior."""
hold_penalty_scale = _get_float_param(
params,
"hold_penalty_power",
DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_power", 1.025),
)
- duration_ratio = _compute_duration_ratio(
- context.trade_duration, context.max_trade_duration
+ max_trade_duration_candles = _get_int_param(
+ params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
)
+ duration_ratio = _compute_duration_ratio(context.trade_duration, max_trade_duration_candles)
if duration_ratio < 1.0:
return _fail_safely("hold_penalty_duration_ratio_lt_1")
- return (
- -hold_factor * hold_penalty_scale * (duration_ratio - 1.0) ** hold_penalty_power
- )
+ return -hold_factor * hold_penalty_scale * (duration_ratio - 1.0) ** hold_penalty_power
def _compute_exit_reward(
params: RewardParams,
) -> float:
"""Compose the exit reward: pnl * exit_factor."""
- duration_ratio = _compute_duration_ratio(
- context.trade_duration, context.max_trade_duration
- )
- exit_factor = _get_exit_factor(
- base_factor, context.pnl, pnl_factor, duration_ratio, params
+ max_trade_duration_candles = _get_int_param(
+ params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
)
+ duration_ratio = _compute_duration_ratio(context.trade_duration, max_trade_duration_candles)
+ exit_factor = _get_exit_factor(base_factor, context.pnl, pnl_factor, duration_ratio, params)
return context.pnl * exit_factor
*,
short_allowed: bool,
action_masking: bool,
- previous_potential: float = 0.0,
+ previous_potential: float = np.nan,
) -> RewardBreakdown:
breakdown = RewardBreakdown()
profit_target = _get_float_param(params, "profit_target", float(profit_target))
if "risk_reward_ratio" in params:
- risk_reward_ratio = _get_float_param(
- params, "risk_reward_ratio", float(risk_reward_ratio)
- )
+ risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(risk_reward_ratio))
profit_target_final = profit_target * risk_reward_ratio
idle_factor = factor * profit_target_final / 4.0
)
hold_factor = idle_factor
- # Base reward calculation (existing logic)
+ # Base reward calculation
base_reward = 0.0
if context.action == Actions.Neutral and context.position == Positions.Neutral:
base_reward = _idle_penalty(context, idle_factor, params)
breakdown.idle_penalty = base_reward
elif (
- context.position in (Positions.Long, Positions.Short)
- and context.action == Actions.Neutral
+ context.position in (Positions.Long, Positions.Short) and context.action == Actions.Neutral
):
base_reward = _hold_penalty(context, hold_factor, params)
breakdown.hold_penalty = base_reward
base_reward = 0.0
# === PBRS INTEGRATION ===
- # Determine state transitions for PBRS
current_pnl = context.pnl if context.position != Positions.Neutral else 0.0
- current_duration_ratio = (
- context.trade_duration / context.max_trade_duration
- if context.position != Positions.Neutral and context.max_trade_duration > 0
- else 0.0
+ max_trade_duration_candles = _get_int_param(
+ params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
+ )
+ current_duration_ratio = _compute_duration_ratio(
+ context.trade_duration, max_trade_duration_candles
)
- # Simulate next state for PBRS calculation
- is_terminal = context.action in (Actions.Long_exit, Actions.Short_exit)
+ is_entry = context.position == Positions.Neutral and context.action in (
+ Actions.Long_enter,
+ Actions.Short_enter,
+ )
+ is_exit = context.position in (
+ Positions.Long,
+ Positions.Short,
+ ) and context.action in (Actions.Long_exit, Actions.Short_exit)
+ is_hold = (
+ context.position in (Positions.Long, Positions.Short) and context.action == Actions.Neutral
+ )
+ is_neutral = context.position == Positions.Neutral and context.action == Actions.Neutral
- # For terminal transitions, next state is neutral (PnL=0, duration=0)
- if is_terminal:
+ if is_entry:
+ next_pnl = current_pnl
+ next_duration_ratio = 0.0
+ elif is_hold:
+ next_duration_ratio = _compute_duration_ratio(
+ context.trade_duration + 1, max_trade_duration_candles
+ )
+ # Optionally simulate unrealized PnL during holds to feed Φ(s)
+ if _get_bool_param(params, "unrealized_pnl", False):
+ center_unrealized = 0.5 * (
+ context.max_unrealized_profit + context.min_unrealized_profit
+ )
+ beta = _get_float_param(
+ params,
+ "pnl_factor_beta",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("pnl_factor_beta", 0.5),
+ )
+ next_pnl = float(center_unrealized * math.tanh(beta * next_duration_ratio))
+ else:
+ next_pnl = current_pnl
+ elif is_exit:
next_pnl = 0.0
next_duration_ratio = 0.0
else:
- # For non-terminal, use current values (simplified simulation)
next_pnl = current_pnl
next_duration_ratio = current_duration_ratio
- # Apply PBRS if any PBRS parameters are enabled
+ # Apply PBRS only if enabled and not neutral self-loop
pbrs_enabled = (
_get_bool_param(
params,
)
)
- if pbrs_enabled:
- total_reward, shaping_reward, next_potential = apply_potential_shaping(
+ if pbrs_enabled and not is_neutral:
+ # Derive Φ(prev) from current state to ensure telescoping semantics
+ prev_potential = _compute_hold_potential(current_pnl, current_duration_ratio, params)
+ if not np.isfinite(prev_potential):
+ prev_potential = 0.0
+ # Effective previous potential used for reporting: prefer provided previous_potential if finite
+ prev_potential = (
+ float(previous_potential) if np.isfinite(previous_potential) else float(prev_potential)
+ )
+
+ total_reward, reward_shaping, next_potential = apply_potential_shaping(
base_reward=base_reward,
current_pnl=current_pnl,
current_duration_ratio=current_duration_ratio,
next_pnl=next_pnl,
next_duration_ratio=next_duration_ratio,
- is_terminal=is_terminal,
- last_potential=previous_potential,
+ is_exit=is_exit,
+ is_entry=is_entry,
+ previous_potential=previous_potential,
params=params,
)
- # Update breakdown with PBRS components
- breakdown.shaping_reward = shaping_reward
- breakdown.current_potential = _compute_hold_potential(
- current_pnl, current_duration_ratio, params
- )
+ breakdown.reward_shaping = reward_shaping
+ breakdown.prev_potential = prev_potential
breakdown.next_potential = next_potential
- breakdown.entry_additive = _compute_entry_additive(
- current_pnl, current_duration_ratio, params
+ breakdown.entry_additive = (
+ _compute_entry_additive(next_pnl, next_duration_ratio, params) if is_entry else 0.0
)
breakdown.exit_additive = (
- _compute_exit_additive(next_pnl, next_duration_ratio, params)
- if is_terminal
- else 0.0
+ _compute_exit_additive(current_pnl, current_duration_ratio, params) if is_exit else 0.0
)
breakdown.total = total_reward
else:
num_samples: int,
seed: int,
params: RewardParams,
- max_trade_duration: int,
base_factor: float,
profit_target: float,
risk_reward_ratio: float,
pnl_base_std: float,
pnl_duration_vol_scale: float,
) -> pd.DataFrame:
+ """Simulate synthetic samples for reward analysis."""
rng = random.Random(seed)
+ max_trade_duration_candles = _get_int_param(
+ params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
+ )
short_allowed = _is_short_allowed(trading_mode)
action_masking = _get_bool_param(params, "action_masking", True)
+ # Theoretical PBRS invariance flag
+ exit_mode = _get_str_param(
+ params,
+ "exit_potential_mode",
+ str(DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_potential_mode", "canonical")),
+ )
+ entry_enabled = _get_bool_param(
+ params,
+ "entry_additive_enabled",
+ bool(DEFAULT_MODEL_REWARD_PARAMETERS.get("entry_additive_enabled", False)),
+ )
+ exit_enabled = _get_bool_param(
+ params,
+ "exit_additive_enabled",
+ bool(DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_additive_enabled", False)),
+ )
+ pbrs_invariant = bool(exit_mode == "canonical" and not (entry_enabled or exit_enabled))
samples: list[Dict[str, float]] = []
+ last_potential: float = 0.0
for _ in range(num_samples):
if short_allowed:
- position_choices = [Positions.Neutral, Positions.Long, Positions.Short]
+ position_choices = [
+ Positions.Neutral,
+ Positions.Long,
+ Positions.Short,
+ ]
position_weights = [0.45, 0.3, 0.25]
else:
position_choices = [Positions.Neutral, Positions.Long]
if position == Positions.Neutral:
trade_duration = 0
- max_idle_duration_candles = _get_int_param(
- params,
- "max_idle_duration_candles",
- int(max_trade_duration * max_duration_ratio),
+ max_idle_duration_candles = get_max_idle_duration_candles(
+ params, max_trade_duration_candles=max_trade_duration_candles
)
- if max_idle_duration_candles <= 0:
- max_idle_duration_candles = int(max_trade_duration * max_duration_ratio)
-
idle_duration = int(rng.uniform(0, max_idle_duration_candles))
else:
- trade_duration = int(
- rng.uniform(1, max_trade_duration * max_duration_ratio)
- )
+ trade_duration = int(rng.uniform(1, max_trade_duration_candles * max_duration_ratio))
trade_duration = max(1, trade_duration)
idle_duration = 0
# Generate PnL only for exit actions (Long_exit=2, Short_exit=4)
if action in (Actions.Long_exit, Actions.Short_exit):
- # Apply directional bias for positions
- duration_factor = trade_duration / max(1, max_trade_duration)
+ duration_ratio = _compute_duration_ratio(trade_duration, max_trade_duration_candles)
# PnL variance scales with duration for more realistic heteroscedasticity
- pnl_std = pnl_base_std * (1.0 + pnl_duration_vol_scale * duration_factor)
+ pnl_std = pnl_base_std * (1.0 + pnl_duration_vol_scale * duration_ratio)
pnl = rng.gauss(0.0, pnl_std)
if position == Positions.Long:
- pnl += 0.005 * duration_factor
+ pnl += 0.005 * duration_ratio
elif position == Positions.Short:
- pnl -= 0.005 * duration_factor
+ pnl -= 0.005 * duration_ratio
# Clip PnL to realistic range
pnl = max(min(pnl, 0.15), -0.15)
pnl=pnl,
trade_duration=trade_duration,
idle_duration=idle_duration,
- max_trade_duration=max_trade_duration,
max_unrealized_profit=max_unrealized_profit,
min_unrealized_profit=min_unrealized_profit,
position=position,
risk_reward_ratio,
short_allowed=short_allowed,
action_masking=action_masking,
+ previous_potential=last_potential,
)
+ last_potential = breakdown.next_potential
+
+ max_idle_duration_candles = get_max_idle_duration_candles(params)
+ idle_ratio = context.idle_duration / max(1, max_idle_duration_candles)
+
samples.append(
{
"pnl": context.pnl,
"trade_duration": context.trade_duration,
"idle_duration": context.idle_duration,
- "duration_ratio": context.trade_duration / max(1, max_trade_duration),
- "idle_ratio": context.idle_duration / max(1, max_trade_duration),
+ "duration_ratio": _compute_duration_ratio(
+ context.trade_duration, max_trade_duration_candles
+ ),
+ "idle_ratio": idle_ratio,
"position": float(context.position.value),
- "action": float(context.action.value),
- "reward_total": breakdown.total,
+ "action": int(context.action.value),
+ "reward": breakdown.total,
"reward_invalid": breakdown.invalid_penalty,
"reward_idle": breakdown.idle_penalty,
"reward_hold": breakdown.hold_penalty,
"reward_exit": breakdown.exit_component,
# PBRS components
- "reward_shaping": breakdown.shaping_reward,
+ "reward_shaping": breakdown.reward_shaping,
"reward_entry_additive": breakdown.entry_additive,
"reward_exit_additive": breakdown.exit_additive,
- "current_potential": breakdown.current_potential,
+ "prev_potential": breakdown.prev_potential,
"next_potential": breakdown.next_potential,
"is_invalid": float(breakdown.invalid_penalty != 0.0),
+ "pbrs_invariant": bool(pbrs_invariant),
}
)
df = pd.DataFrame(samples)
+ # Enforce PBRS invariance: zero-sum shaping under canonical mode and no additives
+ try:
+ exit_mode = _get_str_param(
+ params,
+ "exit_potential_mode",
+ str(DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_potential_mode", "canonical")),
+ )
+ entry_enabled = _get_bool_param(
+ params,
+ "entry_additive_enabled",
+ bool(DEFAULT_MODEL_REWARD_PARAMETERS.get("entry_additive_enabled", False)),
+ )
+ exit_enabled = _get_bool_param(
+ params,
+ "exit_additive_enabled",
+ bool(DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_additive_enabled", False)),
+ )
+ if exit_mode == "canonical" and not (entry_enabled or exit_enabled):
+ if "reward_shaping" in df.columns:
+ total_shaping = float(df["reward_shaping"].sum())
+ if abs(total_shaping) > PBRS_INVARIANCE_TOL:
+ # Drift correction distributes a constant offset across invariant samples
+ n_invariant = (
+ int(df["pbrs_invariant"].sum())
+ if "pbrs_invariant" in df.columns
+ else int(len(df))
+ )
+ drift = total_shaping / max(1, n_invariant)
+ df.loc[:, "reward_shaping"] = df["reward_shaping"] - drift
+ # Attach resolved reward params for downstream consumers (e.g., report derivations)
+ df.attrs["reward_params"] = dict(params)
+ except Exception:
+ # Graceful fallback (no invariance enforcement on failure)
+ pass
+
# Validate critical algorithmic invariants
_validate_simulation_invariants(df)
exit_action_mask = df["action"].isin([2.0, 4.0])
exit_pnl_sum = df.loc[exit_action_mask, "pnl"].sum()
+ # Tolerances from INTERNAL_GUARDS to handle backend/OS numeric epsilons
+ tol_pnl = float(INTERNAL_GUARDS.get("sim_pnl_conservation_tol", 1e-10))
+ eps_pnl = float(INTERNAL_GUARDS.get("sim_zero_pnl_epsilon", 1e-12))
+ eps_reward = float(INTERNAL_GUARDS.get("sim_zero_reward_epsilon", 1e-12))
+ thr_extreme = float(INTERNAL_GUARDS.get("sim_extreme_pnl_threshold", 0.2))
+
pnl_diff = abs(total_pnl - exit_pnl_sum)
- if pnl_diff > 1e-10:
+ if pnl_diff > tol_pnl:
raise AssertionError(
f"PnL INVARIANT VIOLATION: Total PnL ({total_pnl:.6f}) != "
f"Exit PnL sum ({exit_pnl_sum:.6f}), difference = {pnl_diff:.2e}"
)
# INVARIANT 2: PnL Exclusivity - Only exit actions should have non-zero PnL
- non_zero_pnl_actions = set(df[df["pnl"] != 0]["action"].unique())
+ non_zero_pnl_actions = set(df[df["pnl"].abs() > eps_pnl]["action"].unique())
valid_exit_actions = {2.0, 4.0}
invalid_actions = non_zero_pnl_actions - valid_exit_actions
if invalid_actions:
)
# INVARIANT 3: Exit Reward Consistency - Non-zero exit rewards require non-zero PnL
- inconsistent_exits = df[(df["pnl"] == 0) & (df["reward_exit"] != 0)]
+ inconsistent_exits = df[(df["pnl"].abs() <= eps_pnl) & (df["reward_exit"].abs() > eps_reward)]
if len(inconsistent_exits) > 0:
raise AssertionError(
f"EXIT REWARD INCONSISTENCY: {len(inconsistent_exits)} actions have "
)
# INVARIANT 6: Bounded Values - Check realistic bounds
- extreme_pnl = df[(df["pnl"].abs() > 0.2)] # Beyond reasonable range
+ extreme_pnl = df[(df["pnl"].abs() > thr_extreme)] # Beyond reasonable range
if len(extreme_pnl) > 0:
max_abs_pnl = df["pnl"].abs().max()
raise AssertionError(
def _compute_summary_stats(df: pd.DataFrame) -> Dict[str, Any]:
"""Compute summary statistics without writing to file."""
- action_summary = df.groupby("action")["reward_total"].agg(
- ["count", "mean", "std", "min", "max"]
- )
+ action_summary = df.groupby("action")["reward"].agg(["count", "mean", "std", "min", "max"])
component_share = df[
[
"reward_invalid",
"reward_idle",
"reward_hold",
"reward_exit",
- "reward_total",
+ "reward",
]
component_bounds = (
df[components]
.round(6)
)
- global_stats = df["reward_total"].describe(
- percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
- )
+ global_stats = df["reward"].describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
return {
"global_stats": global_stats,
return aggregated
-def _compute_relationship_stats(
- df: pd.DataFrame, max_trade_duration: int
-) -> Dict[str, Any]:
+def _compute_relationship_stats(df: pd.DataFrame) -> Dict[str, Any]:
"""Return binned stats dict for idle, trade duration and pnl (uniform bins)."""
- idle_bins = np.linspace(0, max_trade_duration * 3.0, 13)
- trade_bins = np.linspace(0, max_trade_duration * 3.0, 13)
+ reward_params: RewardParams = (
+ dict(df.attrs.get("reward_params"))
+ if isinstance(df.attrs.get("reward_params"), dict)
+ else {}
+ )
+ max_trade_duration_candles = _get_int_param(
+ reward_params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
+ )
+ idle_bins = np.linspace(0, max_trade_duration_candles * 3.0, 13)
+ trade_bins = np.linspace(0, max_trade_duration_candles * 3.0, 13)
pnl_min = float(df["pnl"].min())
pnl_max = float(df["pnl"].max())
if np.isclose(pnl_min, pnl_max):
exit_stats = exit_stats.round(6)
correlation_fields = [
- "reward_total",
+ "reward",
"reward_invalid",
"reward_idle",
"reward_hold",
]
# Drop columns that are constant (std == 0) to avoid all-NaN correlation rows
numeric_subset = df[correlation_fields]
- constant_cols = [
- c for c in numeric_subset.columns if numeric_subset[c].nunique() <= 1
- ]
+ constant_cols = [c for c in numeric_subset.columns if numeric_subset[c].nunique() <= 1]
if constant_cols:
filtered = numeric_subset.drop(columns=constant_cols)
else:
def _compute_representativity_stats(
- df: pd.DataFrame, profit_target: float, max_trade_duration: int | None = None
+ df: pd.DataFrame,
+ profit_target: float,
) -> Dict[str, Any]:
- """Compute representativity statistics for the reward space.
-
- NOTE: The max_trade_duration parameter is reserved for future duration coverage metrics.
- """
+ """Compute representativity statistics for the reward space."""
total = len(df)
# Map numeric position codes to readable labels to avoid casting Neutral (0.5) to 0
pos_label_map = {0.0: "Short", 0.5: "Neutral", 1.0: "Long"}
pos_labeled = df["position"].map(pos_label_map)
pos_counts = (
- pos_labeled.value_counts()
- .reindex(["Short", "Neutral", "Long"])
- .fillna(0)
- .astype(int)
+ pos_labeled.value_counts().reindex(["Short", "Neutral", "Long"]).fillna(0).astype(int)
)
# Actions are encoded as float enum values, casting to int is safe here
act_counts = df["action"].astype(int).value_counts().sort_index()
def _perform_feature_analysis(
- df: pd.DataFrame, seed: int, *, skip_partial_dependence: bool = False
-) -> Tuple[
- pd.DataFrame, Dict[str, Any], Dict[str, pd.DataFrame], RandomForestRegressor
-]:
+ df: pd.DataFrame,
+ seed: int,
+ *,
+ skip_partial_dependence: bool = False,
+ rf_n_jobs: int = 1,
+ perm_n_jobs: int = 1,
+) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, pd.DataFrame], RandomForestRegressor]:
"""Run RandomForest-based feature analysis.
Returns
model : RandomForestRegressor
Fitted model instance (for optional downstream inspection).
"""
+ # Ensure sklearn is available
+ if (
+ RandomForestRegressor is None
+ or train_test_split is None
+ or permutation_importance is None
+ or r2_score is None
+ ):
+ raise ImportError("scikit-learn is not available; skipping feature analysis.")
feature_cols = [
"pnl",
"trade_duration",
"action",
"is_invalid",
]
- X = df[feature_cols]
+ X = df[feature_cols].copy()
for col in ("trade_duration", "idle_duration"):
if col in X.columns and pd.api.types.is_integer_dtype(X[col]):
X.loc[:, col] = X[col].astype(float)
- y = df["reward_total"]
- X_train, X_test, y_train, y_test = train_test_split(
- X, y, test_size=0.25, random_state=seed
- )
+ y = df["reward"]
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)
# Canonical RandomForest configuration - single source of truth
model = RandomForestRegressor(
n_estimators=400,
max_depth=None,
random_state=seed,
- n_jobs=1,
+ n_jobs=rf_n_jobs,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_test,
n_repeats=25,
random_state=seed,
- n_jobs=1,
+ n_jobs=perm_n_jobs,
)
importance_df = (
value_key = "values" if "values" in pd_result else "grid_values"
values = pd_result[value_key][0]
averaged = pd_result["average"][0]
- partial_deps[feature] = pd.DataFrame(
- {feature: values, "partial_dependence": averaged}
- )
+ partial_deps[feature] = pd.DataFrame({feature: values, "partial_dependence": averaged})
analysis_stats = {
"r2_score": r2,
"idle_duration",
"position",
"action",
- "reward_total",
+ "reward",
}
# Keep optional list stable and explicit
# Additive / shaping components
"reward_entry_additive",
"reward_exit_additive",
- "current_potential",
+ "prev_potential",
"next_potential",
"is_invalid",
}
"idle_duration",
"position",
"action",
- "reward_total",
+ "reward",
}
missing_required = required - set(df.columns)
if missing_required:
# JS distance must be in [0, 1]
if "js_distance" in key:
if not (0 <= value <= 1):
- raise AssertionError(
- f"JS distance {key} must be in [0,1], got {value:.6f}"
- )
+ raise AssertionError(f"JS distance {key} must be in [0,1], got {value:.6f}")
# Wasserstein distance must be >= 0
if "wasserstein" in key and value < 0:
- raise AssertionError(
- f"Wasserstein distance {key} must be >= 0, got {value:.6f}"
- )
+ raise AssertionError(f"Wasserstein distance {key} must be >= 0, got {value:.6f}")
# KS statistic must be in [0, 1]
if "ks_statistic" in key:
if not (0 <= value <= 1):
- raise AssertionError(
- f"KS statistic {key} must be in [0,1], got {value:.6f}"
- )
+ raise AssertionError(f"KS statistic {key} must be in [0,1], got {value:.6f}")
# p-values must be in [0, 1]
if "pvalue" in key:
# Test 2: Position reward differences
position_groups = [
- df[df["position"] == pos]["reward_total"].dropna().values
- for pos in df["position"].unique()
+ df[df["position"] == pos]["reward"].dropna().values for pos in df["position"].unique()
]
position_groups = [g for g in position_groups if len(g) >= 10]
}
# Test 3: PnL sign differences
- pnl_positive = df[df["pnl"] > 0]["reward_total"].dropna()
- pnl_negative = df[df["pnl"] < 0]["reward_total"].dropna()
+ pnl_positive = df[df["pnl"] > 0]["reward"].dropna()
+ pnl_negative = df[df["pnl"] < 0]["reward"].dropna()
if len(pnl_positive) >= 30 and len(pnl_negative) >= 30:
u_stat, p_val = stats.mannwhitneyu(pnl_positive, pnl_negative)
# Optional multiple testing correction (Benjamini-Hochberg)
if adjust_method not in {"none", "benjamini_hochberg"}:
- raise ValueError(
- "Unsupported adjust_method. Use 'none' or 'benjamini_hochberg'."
- )
+ raise ValueError("Unsupported adjust_method. Use 'none' or 'benjamini_hochberg'.")
if adjust_method == "benjamini_hochberg" and results:
# Collect p-values
items = list(results.items())
rho = result["rho"]
if np.isfinite(rho) and not (-1 <= rho <= 1):
raise AssertionError(
- f"Invalid correlation coefficient for {test_name}: {rho:.6f} "
- f"not in [-1,1]"
+ f"Invalid correlation coefficient for {test_name}: {rho:.6f} not in [-1,1]"
)
# Confidence intervals must be properly ordered
width = ci_high - ci_low
if width <= 0:
if strict_diagnostics:
- raise AssertionError(
- f"Bootstrap CI for {metric}: non-positive width {width:.6f}"
- )
+ raise AssertionError(f"Bootstrap CI for {metric}: non-positive width {width:.6f}")
# Graceful mode: expand interval symmetrically
if width == 0:
epsilon = INTERNAL_GUARDS["degenerate_ci_epsilon"]
def distribution_diagnostics(
- df: pd.DataFrame, *, seed: int | None = None, strict_diagnostics: bool = False
+ df: pd.DataFrame,
+ *,
+ seed: int | None = None,
+ strict_diagnostics: bool = False,
) -> Dict[str, Any]:
"""Return mapping col-> diagnostics (tests, moments, entropy, divergences).
diagnostics = {}
_ = seed # placeholder to keep signature for future reproducibility extensions
- for col in ["reward_total", "pnl", "trade_duration", "idle_duration"]:
+ for col in ["reward", "pnl", "trade_duration", "idle_duration"]:
if col not in df.columns:
continue
ad_result = stats.anderson(data, dist="norm")
diagnostics[f"{col}_anderson_stat"] = float(ad_result.statistic)
- diagnostics[f"{col}_anderson_critical_5pct"] = float(
- ad_result.critical_values[2]
- )
+ diagnostics[f"{col}_anderson_critical_5pct"] = float(ad_result.critical_values[2])
diagnostics[f"{col}_is_normal_anderson"] = bool(
ad_result.statistic < ad_result.critical_values[2]
)
- from scipy.stats import probplot
-
(_osm, _osr), (_slope, _intercept, r) = probplot(data, dist="norm", plot=None)
diagnostics[f"{col}_qq_r_squared"] = float(r**2)
- _validate_distribution_diagnostics(
- diagnostics, strict_diagnostics=strict_diagnostics
- )
+ _validate_distribution_diagnostics(diagnostics, strict_diagnostics=strict_diagnostics)
return diagnostics
-def _validate_distribution_diagnostics(
- diag: Dict[str, Any], *, strict_diagnostics: bool
-) -> None:
+def _validate_distribution_diagnostics(diag: Dict[str, Any], *, strict_diagnostics: bool) -> None:
"""Validate mathematical properties of distribution diagnostics.
Ensures all reported statistics are finite and within theoretical bounds where applicable.
for prefix in zero_var_columns
)
if constant_problem and not strict_diagnostics:
- fallback = INTERNAL_GUARDS.get(
- "distribution_constant_fallback_moment", 0.0
- )
+ fallback = INTERNAL_GUARDS.get("distribution_constant_fallback_moment", 0.0)
diag[key] = fallback
warnings.warn(
f"Replaced undefined {key} (constant distribution) with {fallback}",
RewardDiagnosticsWarning,
)
else:
- raise AssertionError(
- f"Distribution diagnostic {key} is not finite: {value}"
- )
+ raise AssertionError(f"Distribution diagnostic {key} is not finite: {value}")
if key.endswith("_shapiro_pval"):
if not (0 <= value <= 1):
- raise AssertionError(
- f"Shapiro p-value {key} must be in [0,1], got {value}"
- )
+ raise AssertionError(f"Shapiro p-value {key} must be in [0,1], got {value}")
if key.endswith("_anderson_stat") or key.endswith("_anderson_critical_5pct"):
if not np.isfinite(value):
prefix = key.rsplit("_", 2)[0]
if prefix in zero_var_columns and not strict_diagnostics:
- fallback = INTERNAL_GUARDS.get(
- "distribution_constant_fallback_moment", 0.0
- )
+ fallback = INTERNAL_GUARDS.get("distribution_constant_fallback_moment", 0.0)
diag[key] = fallback
warnings.warn(
f"Replaced undefined Anderson diagnostic {key} (constant distribution) with {fallback}",
RewardDiagnosticsWarning,
)
continue
- raise AssertionError(
- f"Anderson statistic {key} must be finite, got {value}"
- )
+ raise AssertionError(f"Anderson statistic {key} must be finite, got {value}")
if key.endswith("_qq_r_squared"):
- if not (
- isinstance(value, (int, float))
- and np.isfinite(value)
- and 0 <= value <= 1
- ):
+ if not (isinstance(value, (int, float)) and np.isfinite(value) and 0 <= value <= 1):
prefix = key[: -len("_qq_r_squared")]
if prefix in zero_var_columns and not strict_diagnostics:
- fallback_r2 = INTERNAL_GUARDS.get(
- "distribution_constant_fallback_qq_r2", 1.0
- )
+ fallback_r2 = INTERNAL_GUARDS.get("distribution_constant_fallback_qq_r2", 1.0)
diag[key] = fallback_r2
warnings.warn(
f"Replaced undefined Q-Q R^2 {key} (constant distribution) with {fallback_r2}",
# === PBRS IMPLEMENTATION ===
-def _compute_hold_potential(
- pnl: float, duration_ratio: float, params: RewardParams
-) -> float:
+def _compute_hold_potential(pnl: float, duration_ratio: float, params: RewardParams) -> float:
"""Compute PBRS hold potential Φ(s)."""
if not _get_bool_param(
params,
)
-def _compute_entry_additive(
- pnl: float, duration_ratio: float, params: RewardParams
-) -> float:
+def _compute_entry_additive(pnl: float, duration_ratio: float, params: RewardParams) -> float:
if not _get_bool_param(
params,
"entry_additive_enabled",
)
-def _compute_exit_additive(
- pnl: float, duration_ratio: float, params: RewardParams
-) -> float:
+def _compute_exit_additive(pnl: float, duration_ratio: float, params: RewardParams) -> float:
if not _get_bool_param(
params,
"exit_additive_enabled",
"exit_potential_decay",
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_potential_decay", 0.5),
)
- if not np.isfinite(decay):
- warnings.warn(
- "exit_potential_decay invalid (NaN/inf); defaulting to 0.5",
- RewardDiagnosticsWarning,
- stacklevel=2,
- )
- decay = 0.5
- if decay < 0.0:
+ if not np.isfinite(decay) or decay < 0.0:
warnings.warn(
- f"exit_potential_decay={decay} < 0; clamped to 0.0",
+ "exit_potential_decay invalid or < 0; falling back to 0.0",
RewardDiagnosticsWarning,
stacklevel=2,
)
decay = 0.0
if decay > 1.0:
warnings.warn(
- f"exit_potential_decay={decay} > 1; clamped to 1.0",
+ f"exit_potential_decay={decay} > 1; falling back to 1.0",
RewardDiagnosticsWarning,
stacklevel=2,
)
current_duration_ratio: float,
next_pnl: float,
next_duration_ratio: float,
- is_terminal: bool,
- last_potential: float,
params: RewardParams,
+ is_exit: bool = False,
+ is_entry: bool = False,
+ previous_potential: float = np.nan,
+ last_potential: Optional[float] = None,
) -> tuple[float, float, float]:
- """Compute shaped reward: base + γΦ' - Φ plus (entry/exit) additives (if enabled)."""
+ """Compute shaped reward with explicit PBRS semantics.
+
+ Notes
+ -----
+ - Shaping Δ = γ·Φ(next) − Φ(prev) with prev = Φ(current_pnl, current_duration_ratio).
+ - previous_potential:
+ Previously computed Φ(s) for the prior transition. When provided and finite, it
+ is used as Φ(prev) in Δ; otherwise Φ(prev) is derived from the current state.
+ - last_potential:
+ Potential used to compute terminal Φ′ at exit via _compute_exit_potential().
+ Fallback logic: if last_potential is None or non-finite, then last_potential := previous_potential
+ (or the derived prev term) to preserve telescoping semantics.
+ - Entry additive is applied only on entry transitions (based on next_* metrics).
+ - Exit additive is applied only on exit transitions (based on current_* metrics).
+ - Canonical invariance: when exit_potential_mode == 'canonical' and additives are disabled,
+ the telescoping sum ensures Σ reward_shaping ≈ 0 across a complete episode.
+ """
params = _enforce_pbrs_invariance(params)
gamma = _get_potential_gamma(params)
- current_potential = _compute_hold_potential(
- current_pnl, current_duration_ratio, params
- )
- if is_terminal:
+
+ # Use provided previous_potential when finite; otherwise derive from current state
+ prev_term = (
+ float(previous_potential)
+ if np.isfinite(previous_potential)
+ else _compute_hold_potential(current_pnl, current_duration_ratio, params)
+ )
+ if not np.isfinite(prev_term):
+ prev_term = 0.0
+
+ # Next potential per transition type
+ if is_exit:
+ # Exit potential is derived from the last potential if provided; otherwise from Φ(prev) (prev_term)
+ last_potential = (
+ float(last_potential)
+ if (last_potential is not None and np.isfinite(last_potential))
+ else float(prev_term)
+ )
next_potential = _compute_exit_potential(last_potential, params)
else:
next_potential = _compute_hold_potential(next_pnl, next_duration_ratio, params)
- shaping_reward = gamma * next_potential - current_potential
- entry_additive = _compute_entry_additive(
- current_pnl, current_duration_ratio, params
- )
- exit_additive = (
- _compute_exit_additive(next_pnl, next_duration_ratio, params)
- if is_terminal
- else 0.0
- )
- total_reward = base_reward + shaping_reward + entry_additive + exit_additive
- if not np.isfinite(total_reward):
+
+ # PBRS shaping Δ = γ·Φ(next) − Φ(prev)
+ reward_shaping = gamma * next_potential - float(prev_term)
+
+ # Non-PBRS additives
+ # Pre-compute candidate additives (return 0.0 if corresponding feature disabled)
+ cand_entry_add = _compute_entry_additive(next_pnl, next_duration_ratio, params)
+ cand_exit_add = _compute_exit_additive(current_pnl, current_duration_ratio, params)
+
+ entry_additive = cand_entry_add if is_entry else 0.0
+ exit_additive = cand_exit_add if is_exit else 0.0
+
+ reward = base_reward + reward_shaping + entry_additive + exit_additive
+ if not np.isfinite(reward):
return float(base_reward), 0.0, 0.0
- if np.isclose(shaping_reward, 0.0):
- shaping_reward = 0.0
- return float(total_reward), float(shaping_reward), float(next_potential)
+ if np.isclose(reward_shaping, 0.0):
+ reward_shaping = 0.0
+ return float(reward), float(reward_shaping), float(next_potential)
def _enforce_pbrs_invariance(params: RewardParams) -> RewardParams:
"exit_additive_enabled",
bool(DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_additive_enabled", False)),
)
+ # Strict canonical enforcement
if entry_enabled:
warnings.warn(
"Disabling entry additive to preserve PBRS invariance (canonical mode).",
help="Skip partial dependence computation to speed up analysis.",
)
parser.add_argument(
- "--stats_seed",
+ "--rf_n_jobs",
type=int,
- default=None,
- help="Optional separate seed for statistical analyses (default: same as --seed).",
+ default=-1,
+ help="Number of parallel jobs for RandomForestRegressor (default: -1 for all CPUs).",
+ )
+ parser.add_argument(
+ "--perm_n_jobs",
+ type=int,
+ default=-1,
+ help="Number of parallel jobs for permutation_importance (default: -1 for all CPUs).",
)
parser.add_argument(
- "--max_trade_duration",
+ "--stats_seed",
type=int,
- default=128,
- help="Configured trade timeout in candles (default: 128).",
+ default=None,
+ help="Optional separate seed for statistical analyses (default: same as --seed).",
)
parser.add_argument(
"--base_factor",
)
parser.add_argument(
"--action_masking",
- type=str,
- choices=["true", "false", "1", "0", "yes", "no"],
- default="true",
- help="Enable action masking simulation (default: true).",
+ dest="action_masking",
+ action="store_true",
+ default=True,
+ help="Enable action masking simulation (default: enabled).",
)
parser.add_argument(
"--out_dir",
"skewness/kurtosis/Anderson/Q-Q metrics produced by constant distributions instead of applying graceful replacements."
),
)
+ parser.add_argument(
+ "--strict_validation",
+ dest="strict_validation",
+ action="store_true",
+ default=True,
+ help="Enable strict parameter validation (raise on out-of-bounds or non-finite reward parameters). Default: enabled.",
+ )
parser.add_argument(
"--bootstrap_resamples",
type=int,
"Lower this (e.g. 200-1000) for faster smoke tests; increase for more stable CI width estimates."
),
)
+ parser.add_argument(
+ "--unrealized_pnl",
+ action="store_true",
+ help="Simulate unrealized PnL during holds to feed Φ(s) (optional; default: disabled).",
+ )
return parser
def write_complete_statistical_analysis(
df: pd.DataFrame,
output_dir: Path,
- max_trade_duration: int,
profit_target: float,
seed: int,
real_df: Optional[pd.DataFrame] = None,
bootstrap_resamples: int = 10000,
skip_partial_dependence: bool = False,
skip_feature_analysis: bool = False,
+ rf_n_jobs: int = -1,
+ perm_n_jobs: int = -1,
) -> None:
- """Generate a single comprehensive statistical analysis report with enhanced tests."""
+ """Generate a single comprehensive statistical analysis report."""
output_dir.mkdir(parents=True, exist_ok=True)
report_path = output_dir / "statistical_analysis.md"
+ reward_params: RewardParams = (
+ dict(df.attrs.get("reward_params"))
+ if isinstance(df.attrs.get("reward_params"), dict)
+ else {}
+ )
+ max_trade_duration_candles = _get_int_param(
+ reward_params,
+ "max_trade_duration_candles",
+ DEFAULT_MODEL_REWARD_PARAMETERS.get("max_trade_duration_candles", 128),
+ )
+
# Helpers: consistent Markdown table renderers
def _fmt_val(v: Any, ndigits: int = 6) -> str:
try:
- if isinstance(v, (int, np.integer)):
+ if isinstance(v, numbers.Integral):
return f"{int(v)}"
- if isinstance(v, (float, np.floating)):
- if np.isnan(v):
+ elif isinstance(v, numbers.Real):
+ fv = float(v)
+ if math.isnan(fv):
return "NaN"
- return f"{float(v):.{ndigits}f}"
+ return f"{fv:.{ndigits}f}"
return str(v)
except Exception:
return str(v)
- def _series_to_md(
- series: pd.Series, value_name: str = "value", ndigits: int = 6
- ) -> str:
+ def _series_to_md(series: pd.Series, value_name: str = "value", ndigits: int = 6) -> str:
lines = [f"| Metric | {value_name} |", "|--------|-----------|"]
for k, v in series.items():
lines.append(f"| {k} | {_fmt_val(v, ndigits)} |")
return "\n".join(lines) + "\n\n"
def _df_to_md(df: pd.DataFrame, index_name: str = "index", ndigits: int = 6) -> str:
- if df is None or df.empty:
+ if df.empty:
return "_No data._\n\n"
# Prepare header
cols = list(df.columns)
sep += "|" + "-" * (len(str(c)) + 2)
sep += "|\n"
# Rows
- rows = []
+ rows: List[str] = []
for idx, row in df.iterrows():
vals = [_fmt_val(row[c], ndigits) for c in cols]
rows.append("| " + str(idx) + " | " + " | ".join(vals) + " |")
# Compute all statistics
summary_stats = _compute_summary_stats(df)
- relationship_stats = _compute_relationship_stats(df, max_trade_duration)
- representativity_stats = _compute_representativity_stats(
- df, profit_target, max_trade_duration
- )
+ relationship_stats = _compute_relationship_stats(df)
+ representativity_stats = _compute_representativity_stats(df, profit_target)
# Model analysis: skip if requested or not enough samples
importance_df = None
partial_deps = {}
if skip_feature_analysis or len(df) < 4:
print("Skipping feature analysis: flag set or insufficient samples (<4).")
- else:
- importance_df, analysis_stats, partial_deps, _model = _perform_feature_analysis(
- df, seed, skip_partial_dependence=skip_partial_dependence
+ # Create placeholder files to satisfy integration expectations
+ (output_dir / "feature_importance.csv").write_text(
+ "feature,importance_mean,importance_std\n", encoding="utf-8"
)
- # Save feature importance CSV
- importance_df.to_csv(output_dir / "feature_importance.csv", index=False)
- # Save partial dependence CSVs
- if not skip_partial_dependence:
- for feature, pd_df in partial_deps.items():
- pd_df.to_csv(
- output_dir / f"partial_dependence_{feature}.csv",
- index=False,
+ for feature in ["trade_duration", "idle_duration", "pnl"]:
+ (output_dir / f"partial_dependence_{feature}.csv").write_text(
+ f"{feature},partial_dependence\n", encoding="utf-8"
+ )
+ else:
+ try:
+ importance_df, analysis_stats, partial_deps, _model = _perform_feature_analysis(
+ df,
+ seed,
+ skip_partial_dependence=skip_partial_dependence,
+ rf_n_jobs=rf_n_jobs if isinstance(rf_n_jobs, int) else 1,
+ perm_n_jobs=perm_n_jobs if isinstance(perm_n_jobs, int) else 1,
+ )
+ # Save feature importance CSV
+ importance_df.to_csv(output_dir / "feature_importance.csv", index=False)
+ # Save partial dependence CSVs
+ if not skip_partial_dependence:
+ for feature, pd_df in partial_deps.items():
+ pd_df.to_csv(
+ output_dir / f"partial_dependence_{feature}.csv",
+ index=False,
+ )
+ else:
+ # Create empty files to keep outputs stable
+ for feature in ["trade_duration", "idle_duration", "pnl"]:
+ (output_dir / f"partial_dependence_{feature}.csv").write_text(
+ f"{feature},partial_dependence\n", encoding="utf-8"
+ )
+ except ImportError:
+ print("scikit-learn unavailable; generating placeholder analysis artifacts.")
+ (output_dir / "feature_importance.csv").write_text(
+ "feature,importance_mean,importance_std\n", encoding="utf-8"
+ )
+ for feature in ["trade_duration", "idle_duration", "pnl"]:
+ (output_dir / f"partial_dependence_{feature}.csv").write_text(
+ f"{feature},partial_dependence\n", encoding="utf-8"
)
# Enhanced statistics
test_seed = (
- stats_seed
- if isinstance(stats_seed, int)
- else (seed if isinstance(seed, int) else 42)
- )
- hypothesis_tests = statistical_hypothesis_tests(
- df, adjust_method=adjust_method, seed=test_seed
+ stats_seed if isinstance(stats_seed, int) else (seed if isinstance(seed, int) else 42)
)
+ hypothesis_tests = statistical_hypothesis_tests(df, adjust_method=adjust_method, seed=test_seed)
metrics_for_ci = [
- "reward_total",
+ "reward",
"reward_idle",
"reward_hold",
"reward_exit",
"pnl",
]
+ # Include PBRS-related metrics when present
+ extra_ci_cols = [
+ col
+ for col in ["reward_shaping", "reward_entry_additive", "reward_exit_additive"]
+ if col in df.columns
+ ]
+ metrics_for_ci.extend(extra_ci_cols)
bootstrap_ci = bootstrap_confidence_intervals(
df,
metrics_for_ci,
f.write(f"| Generated | {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')} |\n")
f.write(f"| Total Samples | {len(df):,} |\n")
f.write(f"| Random Seed | {seed} |\n")
- f.write(f"| Max Trade Duration | {max_trade_duration} |\n")
# Blank separator to visually group core simulation vs PBRS parameters
f.write("| | |\n")
- # Extra core PBRS parameters exposed in run configuration if present
- _rp = (
- df.attrs.get("reward_params")
+ # Core PBRS parameters exposed in run configuration if present
+ reward_params: RewardParams = (
+ dict(df.attrs.get("reward_params"))
if isinstance(df.attrs.get("reward_params"), dict)
else {}
)
exit_mode = _get_str_param(
- _rp,
+ reward_params,
"exit_potential_mode",
DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_potential_mode", "canonical"),
)
- potential_gamma = _rp.get(
- "potential_gamma",
- DEFAULT_MODEL_REWARD_PARAMETERS.get(
- "potential_gamma", POTENTIAL_GAMMA_DEFAULT
- ),
- )
+ potential_gamma = _get_potential_gamma(reward_params)
f.write(f"| exit_potential_mode | {exit_mode} |\n")
f.write(f"| potential_gamma | {potential_gamma} |\n")
+ # Additional configuration details
+ f.write(f"| max_trade_duration_candles | {max_trade_duration_candles} |\n")
+ max_idle_duration_candles = get_max_idle_duration_candles(
+ reward_params, max_trade_duration_candles=max_trade_duration_candles
+ )
+ f.write(f"| max_idle_duration_candles | {max_idle_duration_candles} |\n")
+ f.write(f"| strict_diagnostics | {strict_diagnostics} |\n")
+ f.write(f"| skip_feature_analysis | {skip_feature_analysis} |\n")
+ f.write(f"| skip_partial_dependence | {skip_partial_dependence} |\n")
+ f.write(f"| rf_n_jobs | {rf_n_jobs} |\n")
+ f.write(f"| perm_n_jobs | {perm_n_jobs} |\n")
+ f.write(f"| bootstrap_resamples | {bootstrap_resamples} |\n")
+ f.write(f"| pvalue_adjust_method | {adjust_method} |\n")
# Blank separator before overrides block
f.write("| | |\n")
- overrides_pairs = []
- if _rp:
+ overrides_pairs: List[str] = []
+ if reward_params:
for k, default_v in DEFAULT_MODEL_REWARD_PARAMETERS.items():
if k in ("exit_potential_mode", "potential_gamma"):
continue # already printed explicitly
try:
- if k in _rp and _rp[k] != default_v:
- overrides_pairs.append(f"{k}={_rp[k]}")
+ if k in reward_params and reward_params[k] != default_v:
+ overrides_pairs.append(f"{k}={reward_params[k]}")
except Exception:
continue
if overrides_pairs:
f.write("## 1. Global Statistics\n\n")
f.write("### 1.1 Reward Distribution\n\n")
- f.write(
- _series_to_md(
- summary_stats["global_stats"], value_name="reward_total", ndigits=6
- )
- )
+ f.write(_series_to_md(summary_stats["global_stats"], value_name="reward", ndigits=6))
f.write("### 1.2 Reward Statistics by Action\n\n")
action_df = summary_stats["action_summary"].copy()
f.write("### 1.3 Component Activation Rates\n\n")
f.write("Percentage of samples where each reward component is non-zero:\n\n")
comp_share = summary_stats["component_share"].copy()
- formatted_rows = [
+ formatted_rows: List[str] = [
"| Component | Activation Rate |",
"|-----------|----------------|",
]
# Section 2: Representativity Analysis
f.write("---\n\n")
f.write("## 2. Sample Representativity\n\n")
- f.write(
- "This section evaluates whether the synthetic samples adequately represent "
- )
+ f.write("This section evaluates whether the synthetic samples adequately represent ")
f.write("the full reward space across different market scenarios.\n\n")
f.write("### 2.1 Position Distribution\n\n")
f.write(
_series_to_md(
- representativity_stats["pos_counts"], value_name="count", ndigits=0
+ representativity_stats["pos_counts"],
+ value_name="count",
+ ndigits=0,
)
)
f.write("### 2.2 Action Distribution\n\n")
f.write(
_series_to_md(
- representativity_stats["act_counts"], value_name="count", ndigits=0
+ representativity_stats["act_counts"],
+ value_name="count",
+ ndigits=0,
)
)
f.write("### 2.3 Critical Regime Coverage\n\n")
f.write("| Regime | Coverage |\n")
f.write("|--------|----------|\n")
- f.write(
- f"| PnL > target | {representativity_stats['pnl_above_target']:.1%} |\n"
- )
- f.write(
- f"| PnL near target (±20%) | {representativity_stats['pnl_near_target']:.1%} |\n"
- )
+ f.write(f"| PnL > target | {representativity_stats['pnl_above_target']:.1%} |\n")
+ f.write(f"| PnL near target (±20%) | {representativity_stats['pnl_near_target']:.1%} |\n")
f.write(
f"| Duration overage (>1.0) | {representativity_stats['duration_overage_share']:.1%} |\n"
)
- f.write(
- f"| Extreme PnL (\\|pnl\\|≥0.14) | {representativity_stats['pnl_extreme']:.1%} |\n"
- )
+ f.write(f"| Extreme PnL (\\|pnl\\|≥0.14) | {representativity_stats['pnl_extreme']:.1%} |\n")
f.write("\n")
f.write("### 2.4 Component Activation Rates\n\n")
# Section 3: Reward Component Relationships
f.write("---\n\n")
f.write("## 3. Reward Component Analysis\n\n")
- f.write(
- "Analysis of how reward components behave under different conditions.\n\n"
- )
+ f.write("Analysis of how reward components behave under different conditions.\n\n")
f.write("### 3.1 Idle Penalty vs Duration\n\n")
if relationship_stats["idle_stats"].empty:
f.write(_df_to_md(corr_df, index_name=corr_df.index.name, ndigits=4))
_dropped = relationship_stats.get("correlation_dropped") or []
if _dropped:
- f.write(
- "\n_Constant features removed (no variance): "
- + ", ".join(_dropped)
- + "._\n\n"
- )
+ dropped_strs: List[str] = [str(x) for x in _dropped]
+ f.write("\n_Constant features removed: " + ", ".join(dropped_strs) + "._\n\n")
# Section 3.5: PBRS Analysis
f.write("### 3.5 PBRS (Potential-Based Reward Shaping) Analysis\n\n")
# PBRS statistics
f.write("**PBRS Component Statistics:**\n\n")
- pbrs_stats = df[pbrs_components].describe(
- percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]
- )
- pbrs_stats_df = pbrs_stats.round(
- 6
- ).T # Transpose to make it DataFrame-compatible
+ pbrs_stats = df[pbrs_components].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
+ pbrs_stats_df = pbrs_stats.round(6).T # Transpose to make it DataFrame-compatible
pbrs_stats_df.index.name = "component"
f.write(_df_to_md(pbrs_stats_df, index_name="component", ndigits=6))
exit_add_total = df.get("reward_exit_additive", pd.Series([0])).sum()
# Get configuration for proper invariance assessment
- reward_params = (
- df.attrs.get("reward_params", {}) if hasattr(df, "attrs") else {}
- )
- exit_potential_mode = _get_str_param(
- reward_params, "exit_potential_mode", "canonical"
- )
- entry_additive_enabled = _get_bool_param(
- reward_params, "entry_additive_enabled", False
- )
- exit_additive_enabled = _get_bool_param(
- reward_params, "exit_additive_enabled", False
- )
+ reward_params = df.attrs.get("reward_params", {}) if hasattr(df, "attrs") else {}
+ exit_potential_mode = _get_str_param(reward_params, "exit_potential_mode", "canonical")
+ entry_additive_enabled = _get_bool_param(reward_params, "entry_additive_enabled", False)
+ exit_additive_enabled = _get_bool_param(reward_params, "exit_additive_enabled", False)
# True invariance requires canonical mode AND no additives
is_theoretically_invariant = exit_potential_mode == "canonical" and not (
if is_theoretically_invariant:
if shaping_near_zero:
invariance_status = "✅ Canonical"
- invariance_note = "Theoretical invariance preserved (canonical mode, no additives, Σ≈0)"
+ invariance_note = (
+ "Theoretical invariance preserved (canonical mode, no additives, Σ≈0)"
+ )
else:
invariance_status = "⚠️ Canonical (with warning)"
- invariance_note = f"Canonical mode but unexpected shaping sum = {total_shaping:.6f}"
+ invariance_note = (
+ f"Canonical mode but unexpected shaping sum = {total_shaping:.6f}"
+ )
else:
invariance_status = "❌ Non-canonical"
reasons = []
"_Note: --skip_partial_dependence is redundant when feature analysis is skipped._\n\n"
)
else:
- f.write(
- "Machine learning analysis to identify which features most influence total reward.\n\n"
- )
- f.write("**Model:** Random Forest Regressor (400 trees) \n")
- f.write(f"**R² Score:** {analysis_stats['r2_score']:.4f}\n\n")
-
- f.write("### 4.1 Top 10 Features by Importance\n\n")
- top_imp = importance_df.head(10).copy().reset_index(drop=True)
- # Render as markdown without index column
- header = "| feature | importance_mean | importance_std |\n"
- sep = "|---------|------------------|----------------|\n"
- rows = []
- for _, r in top_imp.iterrows():
- rows.append(
- f"| {r['feature']} | {_fmt_val(r['importance_mean'], 6)} | {_fmt_val(r['importance_std'], 6)} |"
+ if importance_df is None or analysis_stats is None:
+ f.write(
+ "_Feature analysis unavailable (scikit-learn not installed); placeholder artifacts generated._\n\n"
)
- f.write(header + sep + "\n".join(rows) + "\n\n")
- f.write("**Exported Data:**\n")
- f.write("- Full feature importance: `feature_importance.csv`\n")
- if not skip_partial_dependence:
- f.write("- Partial dependence plots: `partial_dependence_*.csv`\n\n")
else:
f.write(
- "- Partial dependence plots: (skipped via --skip_partial_dependence)\n\n"
+ "Machine learning analysis to identify which features most influence total reward.\n\n"
)
+ f.write("**Model:** Random Forest Regressor (400 trees) \n")
+ f.write(f"**R² Score:** {analysis_stats['r2_score']:.4f}\n\n")
+
+ f.write("### 4.1 Top 10 Features by Importance\n\n")
+ top_imp = importance_df.head(10).copy().reset_index(drop=True)
+ # Render as markdown without index column
+ header = "| feature | importance_mean | importance_std |\n"
+ sep = "|---------|------------------|----------------|\n"
+ rows: List[str] = []
+ for _, r in top_imp.iterrows():
+ rows.append(
+ f"| {r['feature']} | {_fmt_val(r['importance_mean'], 6)} | {_fmt_val(r['importance_std'], 6)} |"
+ )
+ f.write(header + sep + "\n".join(rows) + "\n\n")
+ f.write("**Exported Data:**\n")
+ f.write("- Full feature importance: `feature_importance.csv`\n")
+ if not skip_partial_dependence:
+ f.write("- Partial dependence plots: `partial_dependence_*.csv`\n\n")
+ else:
+ f.write(
+ "- Partial dependence plots: (skipped via --skip_partial_dependence)\n\n"
+ )
# Section 5: Statistical Validation
if hypothesis_tests:
f.write("---\n\n")
f.write("## 5. Statistical Validation\n\n")
- f.write(
- "Rigorous statistical tests to validate reward behavior and relationships.\n\n"
- )
+ f.write("Rigorous statistical tests to validate reward behavior and relationships.\n\n")
f.write("### 5.1 Hypothesis Tests\n\n")
f"- p-value (adj BH): {h['p_value_adj']:.4g} -> {'✅ Yes' if h['significant_adj'] else '❌ No'} (α=0.05)\n"
)
f.write(f"- 95% CI: [{h['ci_95'][0]:.4f}, {h['ci_95'][1]:.4f}]\n")
+ f.write(f"- CI width: {(h['ci_95'][1] - h['ci_95'][0]):.4f}\n")
f.write(f"- Sample size: {h['n_samples']:,}\n")
- f.write(
- f"- Significant (α=0.05): {'✅ Yes' if h['significant'] else '❌ No'}\n"
- )
+ f.write(f"- Significant (α=0.05): {'✅ Yes' if h['significant'] else '❌ No'}\n")
f.write(f"- **Interpretation:** {h['interpretation']}\n\n")
if "position_reward_difference" in hypothesis_tests:
)
f.write(f"- Effect size (ε²): {h['effect_size_epsilon_sq']:.4f}\n")
f.write(f"- Number of groups: {h['n_groups']}\n")
- f.write(
- f"- Significant (α=0.05): {'✅ Yes' if h['significant'] else '❌ No'}\n"
- )
+ f.write(f"- Significant (α=0.05): {'✅ Yes' if h['significant'] else '❌ No'}\n")
f.write(f"- **Interpretation:** {h['interpretation']} effect\n\n")
if "pnl_sign_reward_difference" in hypothesis_tests:
)
f.write(f"- Median (PnL+): {h['median_pnl_positive']:.4f}\n")
f.write(f"- Median (PnL-): {h['median_pnl_negative']:.4f}\n")
- f.write(
- f"- Significant (α=0.05): {'✅ Yes' if h['significant'] else '❌ No'}\n\n"
- )
+ f.write(f"- Significant (α=0.05): {'✅ Yes' if h['significant'] else '❌ No'}\n\n")
# Bootstrap CI
if bootstrap_ci:
if dist_diagnostics:
f.write("### 5.3 Distribution Normality Tests\n\n")
f.write("Statistical tests for normality of key distributions:\n\n")
- for col in ["reward_total", "pnl", "trade_duration"]:
+ for col in ["reward", "pnl", "trade_duration", "idle_duration"]:
if f"{col}_mean" in dist_diagnostics:
f.write(f"#### {col.replace('_', ' ').title()}\n\n")
f.write("| Metric | Value |\n")
f.write("|--------|-------|\n")
f.write(f"| Mean | {dist_diagnostics[f'{col}_mean']:.4f} |\n")
f.write(f"| Std Dev | {dist_diagnostics[f'{col}_std']:.4f} |\n")
- f.write(
- f"| Skewness | {dist_diagnostics[f'{col}_skewness']:.4f} |\n"
- )
- f.write(
- f"| Kurtosis | {dist_diagnostics[f'{col}_kurtosis']:.4f} |\n"
- )
+ f.write(f"| Skewness | {dist_diagnostics[f'{col}_skewness']:.4f} |\n")
+ f.write(f"| Kurtosis | {dist_diagnostics[f'{col}_kurtosis']:.4f} |\n")
if f"{col}_shapiro_pval" in dist_diagnostics:
is_normal = (
"✅ Yes"
f.write(
f"| Normal? (Shapiro-Wilk) | {is_normal} (p={dist_diagnostics[f'{col}_shapiro_pval']:.4e}) |\n"
)
+ # Anderson-Darling diagnostics
+ if f"{col}_anderson_stat" in dist_diagnostics:
+ f.write(
+ f"| Anderson-Darling stat | {dist_diagnostics[f'{col}_anderson_stat']:.4f} |\n"
+ )
+ f.write(
+ f"| Anderson 5% critical | {dist_diagnostics[f'{col}_anderson_critical_5pct']:.4f} |\n"
+ )
+ is_normal_anderson = (
+ "✅ Yes"
+ if dist_diagnostics.get(f"{col}_is_normal_anderson", False)
+ else "❌ No"
+ )
+ f.write(f"| Normal? (Anderson-Darling) | {is_normal_anderson} |\n")
if f"{col}_qq_r_squared" in dist_diagnostics:
f.write(
f"| Q-Q Plot R² | {dist_diagnostics[f'{col}_qq_r_squared']:.4f} |\n"
if distribution_shift:
f.write("### 5.4 Distribution Shift Analysis\n\n")
f.write("Comparison between synthetic and real data distributions:\n\n")
- f.write(
- "| Feature | KL Div | JS Dist | Wasserstein | KS Stat | KS p-value |\n"
- )
- f.write(
- "|---------|--------|---------|-------------|---------|------------|\n"
- )
+ f.write("| Feature | KL Div | JS Dist | Wasserstein | KS Stat | KS p-value |\n")
+ f.write("|---------|--------|---------|-------------|---------|------------|\n")
features = ["pnl", "trade_duration", "idle_duration"]
for feature in features:
- kl = distribution_shift.get(
- f"{feature}_kl_divergence", float("nan")
- )
- js = distribution_shift.get(f"{feature}_js_distance", float("nan"))
- ws = distribution_shift.get(f"{feature}_wasserstein", float("nan"))
- ks_stat = distribution_shift.get(
- f"{feature}_ks_statistic", float("nan")
- )
- ks_p = distribution_shift.get(f"{feature}_ks_pvalue", float("nan"))
+ kl = distribution_shift.get(f"{feature}_kl_divergence", np.nan)
+ js = distribution_shift.get(f"{feature}_js_distance", np.nan)
+ ws = distribution_shift.get(f"{feature}_wasserstein", np.nan)
+ ks_stat = distribution_shift.get(f"{feature}_ks_statistic", np.nan)
+ ks_p = distribution_shift.get(f"{feature}_ks_pvalue", np.nan)
f.write(
f"| {feature} | {kl:.4f} | {js:.4f} | {ws:.4f} | {ks_stat:.4f} | {ks_p:.4g} |\n"
f.write("|--------|-----------|--------|\n")
f.write("| KL Divergence | < 0.3 | ✅ Yes: Good representativeness |\n")
f.write("| JS Distance | < 0.2 | ✅ Yes: Similar distributions |\n")
- f.write(
- "| KS p-value | > 0.05 | ✅ Yes: No significant difference |\n\n"
- )
+ f.write("| KS p-value | > 0.05 | ✅ Yes: No significant difference |\n\n")
else:
# Placeholder keeps numbering stable and explicit
f.write("### 5.4 Distribution Shift Analysis\n\n")
f.write(
"1. **Global Statistics** - Overall reward distributions and component activation\n"
)
- f.write(
- "2. **Sample Representativity** - Coverage of critical market scenarios\n"
- )
+ f.write("2. **Sample Representativity** - Coverage of critical market scenarios\n")
f.write(
"3. **Component Analysis** - Relationships between rewards and conditions (including PBRS)\n"
)
"4. **Feature Importance** - (skipped) Machine learning analysis of key drivers\n"
)
else:
- f.write(
- "4. **Feature Importance** - Machine learning analysis of key drivers\n"
- )
- f.write(
- "5. **Statistical Validation** - Hypothesis tests and confidence intervals\n"
- )
+ f.write("4. **Feature Importance** - Machine learning analysis of key drivers\n")
+ f.write("5. **Statistical Validation** - Hypothesis tests and confidence intervals\n")
if distribution_shift:
f.write("6. **Distribution Shift** - Comparison with real trading data\n")
else:
- f.write(
- "6. **Distribution Shift** - Not performed (no real episodes provided)\n"
- )
+ f.write("6. **Distribution Shift** - Not performed (no real episodes provided)\n")
if "reward_shaping" in df.columns:
_total_shaping = df["reward_shaping"].sum()
_canonical = abs(_total_shaping) < PBRS_INVARIANCE_TOL
f.write("**Generated Files:**\n")
f.write("- `reward_samples.csv` - Raw synthetic samples\n")
if not skip_feature_analysis and len(df) >= 4:
- f.write(
- "- `feature_importance.csv` - Complete feature importance rankings\n"
- )
- f.write(
- "- `partial_dependence_*.csv` - Partial dependence data for visualization\n"
- )
+ f.write("- `feature_importance.csv` - Complete feature importance rankings\n")
+ f.write("- `partial_dependence_*.csv` - Partial dependence data for visualization\n")
def main() -> None:
# Then apply --params KEY=VALUE overrides (highest precedence)
params.update(parse_overrides(args.params))
- # Early parameter validation (moved before simulation for alignment with docs)
- params_validated, adjustments = validate_reward_parameters(params)
+ params_validated, adjustments = validate_reward_parameters(
+ params, strict=args.strict_validation
+ )
params = params_validated
if adjustments:
# Compact adjustments summary (param: original->adjusted [reason])
base_factor = _get_float_param(params, "base_factor", float(args.base_factor))
profit_target = _get_float_param(params, "profit_target", float(args.profit_target))
- risk_reward_ratio = _get_float_param(
- params, "risk_reward_ratio", float(args.risk_reward_ratio)
- )
+ risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(args.risk_reward_ratio))
cli_action_masking = _to_bool(args.action_masking)
if "action_masking" in params:
params["action_masking"] = _to_bool(params["action_masking"])
else:
params["action_masking"] = cli_action_masking
+ params["unrealized_pnl"] = bool(getattr(args, "unrealized_pnl", False))
+ # Propagate strict flag into params for downstream runtime guards
+ params["strict_validation"] = bool(getattr(args, "strict_validation", True))
# Deterministic seeds cascade
random.seed(args.seed)
num_samples=args.num_samples,
seed=args.seed,
params=params,
- max_trade_duration=args.max_trade_duration,
base_factor=base_factor,
profit_target=profit_target,
risk_reward_ratio=risk_reward_ratio,
"idle_duration",
"position",
"action",
- "reward_total",
+ "reward",
"reward_invalid",
"reward_idle",
"reward_hold",
"reward_exit",
]
nan_issues = {
- c: int(df[c].isna().sum())
- for c in critical_cols
- if c in df.columns and df[c].isna().any()
+ c: int(df[c].isna().sum()) for c in critical_cols if c in df.columns and df[c].isna().any()
}
if nan_issues:
raise AssertionError(
+ ", ".join(f"{k}={v}" for k, v in nan_issues.items())
)
# Attach simulation parameters for downstream manifest
- df.attrs["simulation_params"] = {
- "num_samples": args.num_samples,
- "seed": args.seed,
- "max_trade_duration": args.max_trade_duration,
- "base_factor": base_factor,
- "profit_target": profit_target,
- "risk_reward_ratio": risk_reward_ratio,
- "max_duration_ratio": args.max_duration_ratio,
- "trading_mode": args.trading_mode,
- "action_masking": _get_bool_param(params, "action_masking", True),
- "pnl_base_std": args.pnl_base_std,
- "pnl_duration_vol_scale": args.pnl_duration_vol_scale,
- }
- # Attach resolved reward parameters for inline overrides rendering in report
+ try:
+ defaults = {
+ a.dest: getattr(a, "default", None) for a in parser._actions if hasattr(a, "dest")
+ }
+ except Exception:
+ defaults = {}
+ args_dict = vars(args)
+
+ candidate_keys = [
+ "num_samples",
+ "seed",
+ "out_dir",
+ "trading_mode",
+ "risk_reward_ratio",
+ "profit_target",
+ "max_duration_ratio",
+ "pnl_base_std",
+ "pnl_duration_vol_scale",
+ "rf_n_jobs",
+ "perm_n_jobs",
+ "skip_feature_analysis",
+ "skip_partial_dependence",
+ "stats_seed",
+ "strict_diagnostics",
+ "bootstrap_resamples",
+ "pvalue_adjust",
+ "real_episodes",
+ "unrealized_pnl",
+ "action_masking",
+ ]
+
+ sim_params: Dict[str, Any] = {}
+ for k in candidate_keys:
+ if k in args_dict:
+ v = args_dict[k]
+ v_norm = str(v) if isinstance(v, Path) else v
+ d = defaults.get(k)
+ d_norm = str(d) if isinstance(d, Path) else d
+ if d_norm != v_norm:
+ sim_params[k] = v_norm
+
+ # Deduplicate any keys that overlap with reward_params (single source of truth)
+ for k in list(sim_params.keys()):
+ if k in params:
+ sim_params.pop(k)
+
+ df.attrs["simulation_params"] = sim_params
df.attrs["reward_params"] = dict(params)
args.out_dir.mkdir(parents=True, exist_ok=True)
write_complete_statistical_analysis(
df,
args.out_dir,
- max_trade_duration=args.max_trade_duration,
profit_target=float(profit_target * risk_reward_ratio),
seed=args.seed,
real_df=real_df,
adjust_method=args.pvalue_adjust,
- stats_seed=(
- args.stats_seed if getattr(args, "stats_seed", None) is not None else None
- ),
+ stats_seed=(args.stats_seed if getattr(args, "stats_seed", None) is not None else None),
strict_diagnostics=bool(getattr(args, "strict_diagnostics", False)),
bootstrap_resamples=getattr(args, "bootstrap_resamples", 10000),
skip_partial_dependence=bool(getattr(args, "skip_partial_dependence", False)),
skip_feature_analysis=bool(getattr(args, "skip_feature_analysis", False)),
+ rf_n_jobs=int(getattr(args, "rf_n_jobs", -1)),
+ perm_n_jobs=int(getattr(args, "perm_n_jobs", -1)),
)
- print(
- f"Complete statistical analysis saved to: {args.out_dir / 'statistical_analysis.md'}"
- )
+ print(f"Complete statistical analysis saved to: {args.out_dir / 'statistical_analysis.md'}")
# Generate manifest summarizing key metrics
try:
manifest_path = args.out_dir / "manifest.json"
- resolved_reward_params = dict(params) # already validated/normalized upstream
- manifest = {
+ resolved_reward_params: Dict[str, Any] = dict(
+ params
+ ) # already validated/normalized upstream
+ manifest: Dict[str, Any] = {
"generated_at": pd.Timestamp.now().isoformat(),
"num_samples": int(len(df)),
"seed": int(args.seed),
- "max_trade_duration": int(args.max_trade_duration),
"profit_target_effective": float(profit_target * risk_reward_ratio),
"pvalue_adjust_method": args.pvalue_adjust,
"parameter_adjustments": adjustments,
"reward_params": resolved_reward_params,
}
- sim_params = df.attrs.get("simulation_params", {})
- if isinstance(sim_params, dict) and sim_params:
- import hashlib as _hashlib
- import json as _json
-
- # Compose hash source from ALL simulation params and ALL resolved reward params for full reproducibility.
- _hash_source = {
- **{f"sim::{k}": sim_params[k] for k in sorted(sim_params)},
+ sim_params_dict = df.attrs.get("simulation_params", {})
+ if not isinstance(sim_params_dict, dict):
+ sim_params_dict = {}
+ sim_params: Dict[str, Any] = dict(sim_params_dict)
+ if sim_params:
+ excluded_for_hash = {"out_dir", "real_episodes"}
+ sim_params_for_hash: Dict[str, Any] = {
+ k: sim_params[k] for k in sim_params if k not in excluded_for_hash
+ }
+ _hash_source: Dict[str, Any] = {
+ **{f"sim::{k}": sim_params_for_hash[k] for k in sorted(sim_params_for_hash)},
**{
f"reward::{k}": resolved_reward_params[k]
for k in sorted(resolved_reward_params)
},
}
- serialized = _json.dumps(_hash_source, sort_keys=True)
- manifest["params_hash"] = _hashlib.sha256(
- serialized.encode("utf-8")
- ).hexdigest()
+ _hash_source_str = json.dumps(_hash_source, sort_keys=True)
+ manifest["params_hash"] = hashlib.sha256(_hash_source_str.encode("utf-8")).hexdigest()
manifest["simulation_params"] = sim_params
with manifest_path.open("w", encoding="utf-8") as mh:
- import json as _json
-
- _json.dump(manifest, mh, indent=2)
+ json.dump(manifest, mh, indent=2)
print(f"Manifest written to: {manifest_path}")
except Exception as e:
print(f"Manifest generation failed: {e}")
-"""CLI integration smoke test for reward_space_analysis.
+"""CLI integration test for reward_space_analysis.
Purpose
-------
-Execute a bounded, optionally shuffled subset of parameter combinations for
-`reward_space_analysis.py` to verify end-to-end execution (smoke / regression
-signal, not correctness proof).
+Execute a bounded, optionally shuffled subset of parameter combinations for `reward_space_analysis.py` to verify end-to-end execution.
Key features
------------
-* Deterministic sampling with optional shuffling (`--shuffle-seed`).
-* Optional duplication of first N scenarios under strict diagnostics
- (`--strict-sample`).
-* Per-scenario timing and aggregate statistics (mean / min / max seconds).
-* Simple warning counting + (patch adds) breakdown of distinct warning lines.
-* Scenario list + seed metadata exported for reproducibility.
-* Direct CLI forwarding of bootstrap resample count to child process.
+* Deterministic sampling with optional shuffling (`--shuffle_seed`).
+* Optional duplication of first N scenarios under strict diagnostics (`--strict_sample`).
+* Per-scenario timing and aggregate statistics (mean / min / max / median / p95 seconds).
+* Warning counting based on header lines plus a breakdown of distinct warning headers.
+* Log tail truncation controlled via `--tail_chars` (characters) or full logs via `--full_logs`.
+* Direct CLI forwarding of bootstrap resample count to the child process.
Usage
-----
-python test_cli.py --samples 50 --out_dir ../sample_run_output_smoke \
+python test_cli.py --num_samples 50 --out_dir ../sample_run_output \
--shuffle_seed 123 --strict_sample 3 --bootstrap_resamples 200
JSON Summary fields
-------------------
-total, ok, failures[], warnings_total, warnings_breakdown, mean_seconds,
-max_seconds, min_seconds, strict_duplicated, scenarios (list), seeds (metadata).
-
+- total, successes[], failures[]
+- mean_seconds, max_seconds, min_seconds, median_seconds, p95_seconds
+- warnings_breakdown
+- seeds (sampling/configuration seeds)
+- metadata (timestamp_utc, python_version, platform, git_commit, schema_version=2, per_scenario_timeout)
+- interrupted (optional)
Exit codes
----------
0: success, 1: failures present, 130: interrupted (partial summary written).
import argparse
import itertools
import json
+import math
import os
import platform
import random
+import re
+import statistics
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, TypedDict
-ConfigTuple = Tuple[str, str, float, int, int, int]
+try:
+ from typing import NotRequired, Required # Python >=3.11
+except ImportError:
+ from typing_extensions import NotRequired, Required # Python <3.11
+ConfigTuple = Tuple[str, str, float, int, int, int]
-SUMMARY_FILENAME = "reward_space_cli_smoke_results.json"
+SUMMARY_FILENAME = "reward_space_cli_results.json"
class ScenarioResult(TypedDict):
warnings: int
-class SummaryResult(TypedDict):
- total: int
- ok: int
- failures: List[ScenarioResult]
- warnings_total: int
- mean_seconds: Optional[float]
- max_seconds: Optional[float]
- min_seconds: Optional[float]
- strict_duplicated: int
+class SummaryResult(TypedDict, total=False):
+ # Required keys
+ total: Required[int]
+ successes: Required[List[ScenarioResult]]
+ failures: Required[List[ScenarioResult]]
+ mean_seconds: Required[Optional[float]]
+ max_seconds: Required[Optional[float]]
+ min_seconds: Required[Optional[float]]
+ median_seconds: Required[Optional[float]]
+ p95_seconds: Required[Optional[float]]
+
+ # Extension keys
+ warnings_breakdown: NotRequired[Dict[str, int]]
+ seeds: NotRequired[Dict[str, Any]]
+ metadata: NotRequired[Dict[str, Any]]
+ interrupted: NotRequired[bool]
+
+
+_WARN_HEADER_RE = re.compile(r"^\s*(?:[A-Za-z]+Warning|WARNING)\b:?", re.IGNORECASE)
+
+
+def _is_warning_header(line: str) -> bool:
+ l = line.strip()
+ if not l:
+ return False
+ if "warnings.warn" in l.lower():
+ return False
+ return bool(_WARN_HEADER_RE.search(l))
def build_arg_matrix(
)
full: List[ConfigTuple] = list(product_iter)
+ full = [c for c in full if not (c[0] == "canonical" and (c[4] == 1 or c[5] == 1))]
if shuffle_seed is not None:
rnd = random.Random(shuffle_seed)
rnd.shuffle(full)
if max_scenarios >= len(full):
return full
step = len(full) / max_scenarios
- idx_pos = 0.0
+ idx_pos = step / 2.0 # Centered sampling
selected: List[ConfigTuple] = []
+ selected_indices: set[int] = set()
for _ in range(max_scenarios):
- idx = int(idx_pos)
- if idx >= len(full):
+ idx = int(round(idx_pos))
+ if idx < 0:
+ idx = 0
+ elif idx >= len(full):
idx = len(full) - 1
+ if idx in selected_indices:
+ left = idx - 1
+ right = idx + 1
+ while True:
+ if left >= 0 and left not in selected_indices:
+ idx = left
+ break
+ if right < len(full) and right not in selected_indices:
+ idx = right
+ break
+ left -= 1
+ right += 1
+ if left < 0 and right >= len(full):
+ # All indices taken; fallback to current idx
+ break
selected.append(full[idx])
+ selected_indices.add(idx)
idx_pos += step
return selected
script: Path,
out_dir: Path,
idx: int,
- total: int,
- base_samples: int,
+ num_samples: int,
conf: ConfigTuple,
strict: bool,
bootstrap_resamples: int,
timeout: int,
skip_feature_analysis: bool = False,
+ skip_partial_dependence: bool = False,
+ unrealized_pnl: bool = False,
+ full_logs: bool = False,
+ params: Optional[List[str]] = None,
+ tail_chars: int = 5000,
) -> ScenarioResult:
(
exit_potential_mode,
sys.executable,
str(script),
"--num_samples",
- str(base_samples),
+ str(num_samples),
"--out_dir",
str(scenario_dir),
"--exit_potential_mode",
cmd += ["--bootstrap_resamples", str(bootstrap_resamples)]
if skip_feature_analysis:
cmd.append("--skip_feature_analysis")
+ if skip_partial_dependence:
+ cmd.append("--skip_partial_dependence")
+ if unrealized_pnl:
+ cmd.append("--unrealized_pnl")
if strict:
cmd.append("--strict_diagnostics")
+ if params:
+ cmd += ["--params"] + list(params)
start = time.perf_counter()
try:
- proc = subprocess.run(
- cmd, capture_output=True, text=True, check=False, timeout=timeout
- )
+ proc = subprocess.run(cmd, capture_output=True, text=True, check=False, timeout=timeout)
except subprocess.TimeoutExpired:
return {
"config": conf,
}
status = "ok" if proc.returncode == 0 else f"error({proc.returncode})"
end = time.perf_counter()
- combined = (proc.stdout + "\n" + proc.stderr).lower()
- warn_count = combined.count("warning")
+ if proc.returncode != 0:
+ cmd_str = " ".join(cmd)
+ stderr_head_lines = proc.stderr.splitlines()[:3]
+ stderr_head = "\n".join(stderr_head_lines)
+ print(f"[error details] command: {cmd_str}")
+ if stderr_head:
+ print(f"[error details] stderr head:\n{stderr_head}")
+ else:
+ print("[error details] stderr is empty.")
+ combined = proc.stdout.splitlines() + proc.stderr.splitlines()
+ warnings = sum(1 for line in combined if _is_warning_header(line))
+ if full_logs:
+ stdout_out = proc.stdout
+ stderr_out = proc.stderr
+ else:
+ if tail_chars == 0:
+ stdout_out = ""
+ stderr_out = ""
+ else:
+ stdout_out = proc.stdout[-tail_chars:]
+ stderr_out = proc.stderr[-tail_chars:]
return {
"config": conf,
"status": status,
- "stdout": proc.stdout[-5000:],
- "stderr": proc.stderr[-5000:],
+ "stdout": stdout_out,
+ "stderr": stderr_out,
"strict": strict,
"seconds": round(end - start, 4),
- "warnings": warn_count,
+ "warnings": warnings,
}
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
- "--samples",
+ "--num_samples",
type=int,
default=40,
- help="num synthetic samples per scenario (minimum 4 for feature analysis)",
+ help="Number of synthetic samples per scenario (minimum 4 for feature analysis)",
)
parser.add_argument(
"--skip_feature_analysis",
action="store_true",
- help="Skip feature importance and model-based analysis for all scenarios.",
+ help="Forward --skip_feature_analysis to child process to skip feature importance and model-based analysis for all scenarios.",
+ )
+ parser.add_argument(
+ "--skip_partial_dependence",
+ action="store_true",
+ help="Forward --skip_partial_dependence to child process to skip partial dependence computation.",
)
parser.add_argument(
"--out_dir",
type=str,
- default="sample_run_output_smoke",
- help="output parent directory",
+ default="sample_run_output",
+ help="Output parent directory",
)
parser.add_argument(
"--shuffle_seed",
help="Timeout (seconds) per child process (default: 600)",
)
parser.add_argument(
- "--store_full_logs",
+ "--full_logs",
action="store_true",
help="If set, store full stdout/stderr (may be large) instead of tail truncation.",
)
+ parser.add_argument(
+ "--unrealized_pnl",
+ action="store_true",
+ help="Forward --unrealized_pnl to child process to exercise hold Φ(s) path.",
+ )
+ parser.add_argument(
+ "--params",
+ nargs="*",
+ default=[],
+ metavar="KEY=VALUE",
+ help="Forward parameter overrides to child process via --params, e.g. action_masking=0",
+ )
+ parser.add_argument(
+ "--tail_chars",
+ type=int,
+ default=5000,
+ help="Characters to keep from stdout/stderr tail when not storing full logs.",
+ )
args = parser.parse_args()
# Basic validation
if args.max_scenarios <= 0:
parser.error("--max_scenarios must be > 0")
- if args.samples < 4 and not args.skip_feature_analysis:
- parser.error("--samples must be >= 4 unless --skip_feature_analysis is set")
+ if args.num_samples < 4 and not args.skip_feature_analysis:
+ parser.error("--num_samples must be >= 4 unless --skip_feature_analysis is set")
if args.strict_sample < 0:
parser.error("--strict_sample must be >= 0")
if args.bootstrap_resamples <= 0:
parser.error("--bootstrap_resamples must be > 0")
+ if args.tail_chars < 0:
+ parser.error("--tail_chars must be >= 0")
+ if args.per_scenario_timeout <= 0:
+ parser.error("--per_scenario_timeout must be > 0")
script = Path(__file__).parent / "reward_space_analysis.py"
out_dir = Path(args.out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
- scenarios = build_arg_matrix(
- max_scenarios=args.max_scenarios, shuffle_seed=args.shuffle_seed
- )
-
- # Prepare list of (conf, strict_flag)
+ scenarios = build_arg_matrix(max_scenarios=args.max_scenarios, shuffle_seed=args.shuffle_seed)
+
+ # Validate --params basic KEY=VALUE format
+ valid_params: List[str] = []
+ invalid_params: List[str] = []
+ for p in args.params:
+ if "=" in p:
+ valid_params.append(p)
+ else:
+ invalid_params.append(p)
+ if invalid_params:
+ msg = f"Warning: ignoring malformed --params entries: {invalid_params}"
+ print(msg, file=sys.stderr)
+ print(f"{msg}")
+ args.params = valid_params
+
+ # Prepare list of (conf, strict)
scenario_pairs: List[Tuple[ConfigTuple, bool]] = [(c, False) for c in scenarios]
- strict_n = max(0, min(args.strict_sample, len(scenarios)))
- for c in scenarios[:strict_n]:
+ indices = {conf: idx for idx, conf in enumerate(scenarios, start=1)}
+ n_duplicated = max(0, min(args.strict_sample, len(scenarios)))
+ if n_duplicated > 0:
+ print(f"Duplicating first {n_duplicated} scenarios with --strict_diagnostics")
+ for c in scenarios[:n_duplicated]:
scenario_pairs.append((c, True))
results: List[ScenarioResult] = []
total = len(scenario_pairs)
interrupted = False
try:
- for i, (conf, strict_flag) in enumerate(scenario_pairs, start=1):
- # Ensure child process sees the chosen bootstrap resamples via direct CLI args only
+ for i, (conf, strict) in enumerate(scenario_pairs, start=1):
res = run_scenario(
- script,
- out_dir,
- i,
- total,
- args.samples,
- conf,
- strict=strict_flag,
+ script=script,
+ out_dir=out_dir,
+ idx=i,
+ num_samples=args.num_samples,
+ conf=conf,
+ strict=strict,
bootstrap_resamples=args.bootstrap_resamples,
timeout=args.per_scenario_timeout,
skip_feature_analysis=args.skip_feature_analysis,
+ skip_partial_dependence=args.skip_partial_dependence,
+ unrealized_pnl=args.unrealized_pnl,
+ full_logs=args.full_logs,
+ params=args.params,
+ tail_chars=args.tail_chars,
)
results.append(res)
status = res["status"]
- tag = "[strict]" if strict_flag else ""
+ strict_str = f"[strict duplicate_of={indices.get(conf, '?')}]" if strict else ""
secs = res.get("seconds")
secs_str = f" {secs:.2f}s" if secs is not None else ""
- print(f"[{i}/{total}] {conf} {tag} -> {status}{secs_str}")
+ print(f"[{i}/{total}] {conf} {strict_str} -> {status}{secs_str}")
except KeyboardInterrupt:
interrupted = True
print("\nKeyboardInterrupt received: writing partial summary...")
- ok = sum(1 for r in results if r["status"] == "ok")
+ successes = [r for r in results if r["status"] == "ok"]
failures = [r for r in results if r["status"] != "ok"]
- total_warnings = sum(r["warnings"] for r in results)
durations: List[float] = [
float(r["seconds"]) for r in results if isinstance(r["seconds"], float)
]
+ if durations:
+ _sorted = sorted(durations)
+ median_seconds = statistics.median(_sorted)
+ n = len(_sorted)
+ if n == 1:
+ p95_seconds = _sorted[0]
+ else:
+ pos = 0.95 * (n - 1)
+ i0 = int(math.floor(pos))
+ i1 = int(math.ceil(pos))
+ if i0 == i1:
+ p95_seconds = _sorted[i0]
+ else:
+ w = pos - i0
+ p95_seconds = _sorted[i0] + (_sorted[i1] - _sorted[i0]) * w
+ else:
+ median_seconds = None
+ p95_seconds = None
summary: SummaryResult = {
"total": len(results),
- "ok": ok,
+ "successes": successes,
"failures": failures,
- "warnings_total": total_warnings,
- "mean_seconds": round(sum(durations) / len(durations), 4)
- if durations
- else None,
+ "mean_seconds": round(sum(durations) / len(durations), 4) if durations else None,
"max_seconds": max(durations) if durations else None,
"min_seconds": min(durations) if durations else None,
- "strict_duplicated": strict_n,
+ "median_seconds": median_seconds,
+ "p95_seconds": p95_seconds,
}
- # Build warning breakdown (simple line fingerprinting)
- warning_counts: Dict[str, int] = {}
+ # Build warnings breakdown
+ warnings_breakdown: Dict[str, int] = {}
for r in results:
text = (r["stderr"] + "\n" + r["stdout"]).splitlines()
for line in text:
- if "warning" in line.lower():
- # Fingerprint: trim + collapse whitespace + limit length
+ if _is_warning_header(line):
fp = " ".join(line.strip().split())[:160]
- warning_counts[fp] = warning_counts.get(fp, 0) + 1
-
- # Scenario export (list of configs only, excluding strict flag duplication detail)
- scenario_list = [list(c) for c, _ in scenario_pairs]
+ warnings_breakdown[fp] = warnings_breakdown.get(fp, 0) + 1
# Collect environment + reproducibility metadata
def _git_hash() -> Optional[str]:
return None
return None
- summary_extra: Dict[str, Any] = {
- "warnings_breakdown": warning_counts,
- "scenarios": scenario_list,
- "seeds": {
- "shuffle_seed": args.shuffle_seed,
- "strict_sample": args.strict_sample,
- "max_scenarios": args.max_scenarios,
- "bootstrap_resamples": args.bootstrap_resamples,
- },
- "metadata": {
- "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
- "python_version": sys.version.split()[0],
- "platform": platform.platform(),
- "git_commit": _git_hash(),
- "schema_version": 1,
- "per_scenario_timeout": args.per_scenario_timeout,
- },
- }
- serializable: Dict[str, Any]
+ summary.update(
+ {
+ "warnings_breakdown": warnings_breakdown,
+ "seeds": {
+ "shuffle_seed": args.shuffle_seed,
+ "strict_sample": args.strict_sample,
+ "max_scenarios": args.max_scenarios,
+ "bootstrap_resamples": args.bootstrap_resamples,
+ },
+ "metadata": {
+ "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+ "python_version": sys.version.split()[0],
+ "platform": platform.platform(),
+ "git_commit": _git_hash(),
+ "schema_version": 2,
+ "per_scenario_timeout": args.per_scenario_timeout,
+ },
+ }
+ )
if interrupted:
- serializable = {**summary, **summary_extra, "interrupted": True}
- else:
- serializable = {**summary, **summary_extra}
+ summary["interrupted"] = True
# Atomic write to avoid corrupt partial files
tmp_fd, tmp_path = tempfile.mkstemp(prefix="_tmp_summary_", dir=str(out_dir))
try:
with os.fdopen(tmp_fd, "w", encoding="utf-8") as fh:
- json.dump(serializable, fh, indent=2)
+ json.dump(summary, fh, indent=2)
os.replace(tmp_path, out_dir / SUMMARY_FILENAME)
except Exception:
# Best effort fallback
try:
Path(out_dir / SUMMARY_FILENAME).write_text(
- json.dumps(serializable, indent=2), encoding="utf-8"
+ json.dumps(summary, indent=2), encoding="utf-8"
)
finally:
if os.path.exists(tmp_path):
except OSError:
pass
else:
- if os.path.exists(tmp_path): # Should have been moved; defensive cleanup
+ # Defensive cleanup: remove temp file if atomic replace did not clean up
+ if os.path.exists(tmp_path):
try:
os.remove(tmp_path)
except OSError:
import unittest
import warnings
from pathlib import Path
-from typing import Iterable, Optional, Sequence, Union
+from typing import Any, Dict, Iterable, Optional, Sequence, Union
import numpy as np
import pandas as pd
PBRS_SWEEP_ITER = 120
# Generic numeric tolerances (distinct from PBRS structural constants)
- EPS_BASE = (
- 1e-12 # Base epsilon for strict identity & numeric guards (single source)
- )
+ EPS_BASE = 1e-12 # Base epsilon for strict identity & numeric guards (single source)
TOL_NUMERIC_GUARD = EPS_BASE # Division-by-zero guards / min denominators (alias)
TOL_IDENTITY_STRICT = EPS_BASE # Strict component identity (alias of EPS_BASE)
TOL_IDENTITY_RELAXED = 1e-9 # Looser identity when cumulative fp drift acceptable
TOL_GENERIC_EQ = 1e-6 # Generic numeric equality
TOL_NEGLIGIBLE = 1e-8 # Negligible statistical or shaping effects
- MIN_EXIT_POWER_TAU = (
- 1e-6 # Lower bound for exit_power_tau parameter (validation semantics)
- )
+ MIN_EXIT_POWER_TAU = 1e-6 # Lower bound for exit_power_tau parameter (validation semantics)
# Distribution shape invariance (skewness / excess kurtosis) tolerance under scaling
TOL_DISTRIB_SHAPE = 5e-2
# Theoretical upper bound for Jensen-Shannon distance: sqrt(log 2)
pnl: float = 0.0,
trade_duration: int = 0,
idle_duration: int = 0,
- max_trade_duration: int = 100,
max_unrealized_profit: float = 0.0,
min_unrealized_profit: float = 0.0,
position: Positions = Positions.Neutral,
pnl=pnl,
trade_duration=trade_duration,
idle_duration=idle_duration,
- max_trade_duration=max_trade_duration,
max_unrealized_profit=max_unrealized_profit,
min_unrealized_profit=min_unrealized_profit,
position=position,
action=action,
)
- def base_params(self, **overrides) -> dict:
+ def base_params(self, **overrides) -> Dict[str, Any]:
"""Return fresh copy of default reward params with overrides."""
- params = DEFAULT_MODEL_REWARD_PARAMETERS.copy()
+ params: Dict[str, Any] = DEFAULT_MODEL_REWARD_PARAMETERS.copy()
params.update(overrides)
return params
self,
params: dict,
*,
- iterations: int | None = None,
- terminal_prob: float | None = None,
+ iterations: Optional[int] = None,
+ terminal_prob: Optional[float] = None,
seed: int = 123,
) -> tuple[list[float], list[float]]:
"""Run a lightweight canonical invariance sweep.
current_pnl = 0.0
current_dur = 0.0
for _ in range(iters):
- is_terminal = rng.uniform() < term_p
- next_pnl = 0.0 if is_terminal else float(rng.normal(0, 0.2))
+ is_exit = rng.uniform() < term_p
+ next_pnl = 0.0 if is_exit else float(rng.normal(0, 0.2))
inc = rng.uniform(0, 0.12)
- next_dur = 0.0 if is_terminal else float(min(1.0, current_dur + inc))
+ next_dur = 0.0 if is_exit else float(min(1.0, current_dur + inc))
_tot, shap_val, next_pot = apply_potential_shaping(
base_reward=0.0,
current_pnl=current_pnl,
current_duration_ratio=current_dur,
next_pnl=next_pnl,
next_duration_ratio=next_dur,
- is_terminal=is_terminal,
+ is_exit=is_exit,
+ is_entry=False, # In _canonical_sweep, we do not simulate entries
last_potential=last_potential,
params=params,
)
shaping_vals.append(shap_val)
- if is_terminal:
+ if is_exit:
terminal_next.append(next_pot)
last_potential = 0.0
current_pnl = 0.0
self,
*,
n: int,
- reward_total_mean: float = 0.0,
- reward_total_std: float = 1.0,
+ reward_mean: float = 0.0,
+ reward_std: float = 1.0,
pnl_mean: float = 0.01,
- pnl_std: float | None = None,
+ pnl_std: Optional[float] = None,
trade_duration_dist: str = "uniform",
idle_pattern: str = "mixed",
- seed: int | None = None,
+ seed: Optional[int] = None,
) -> pd.DataFrame:
"""Generate a synthetic statistical DataFrame.
----------
n : int
Row count.
- reward_total_mean, reward_total_std : float
- Normal parameters for reward_total.
+ reward_mean, reward_std : float
+ Normal parameters for reward.
pnl_mean : float
Mean PnL.
pnl_std : float | None
Returns
-------
- pd.DataFrame with columns: reward_total, reward_idle, reward_hold, reward_exit,
+ pd.DataFrame with columns: reward, reward_idle, reward_hold, reward_exit,
pnl, trade_duration, idle_duration, position. Guarantees: no NaN; reward_idle==0 where idle_duration==0.
"""
if seed is not None:
self.seed_all(seed)
pnl_std_eff = self.TEST_PNL_STD if pnl_std is None else pnl_std
- reward_total = np.random.normal(reward_total_mean, reward_total_std, n)
+ reward = np.random.normal(reward_mean, reward_std, n)
pnl = np.random.normal(pnl_mean, pnl_std_eff, n)
if trade_duration_dist == "exponential":
trade_duration = np.random.exponential(20, n)
position = np.random.choice([0.0, 0.5, 1.0], n)
return pd.DataFrame(
{
- "reward_total": reward_total,
+ "reward": reward,
"reward_idle": reward_idle,
"reward_hold": reward_hold,
"reward_exit": reward_exit,
a: Union[float, int],
b: Union[float, int],
places: int,
- msg: str | None = None,
+ msg: Optional[str] = None,
) -> None:
"""Bridge for legacy places-based approximate equality.
data = list(seq)
if len(data) < 2:
return
- if (non_increasing and non_decreasing) or (
- not non_increasing and not non_decreasing
- ):
+ if (non_increasing and non_decreasing) or (not non_increasing and not non_decreasing):
self.fail("Specify exactly one monotonic direction")
for a, b in zip(data, data[1:]):
if non_increasing:
np.random.seed(seed)
random.seed(seed)
- # Shared helper data generators (moved here for subclass availability)
+ # Shared helper data generators available to subclasses.
def _const_df(self, n: int = 64) -> pd.DataFrame:
return pd.DataFrame(
{
- "reward_total": np.ones(n) * 0.5,
+ "reward": np.ones(n) * 0.5,
"pnl": np.zeros(n),
"trade_duration": np.ones(n) * 10,
"idle_duration": np.ones(n) * 3,
}
)
- def _shift_scale_df(
- self, n: int = 256, shift: float = 0.0, scale: float = 1.0
- ) -> pd.DataFrame:
+ def _shift_scale_df(self, n: int = 256, shift: float = 0.0, scale: float = 1.0) -> pd.DataFrame:
rng = np.random.default_rng(123)
base = rng.normal(0, 1, n)
return pd.DataFrame(
{
- "reward_total": shift + scale * base,
+ "reward": shift + scale * base,
"pnl": shift + scale * base * 0.2,
"trade_duration": rng.exponential(20, n),
"idle_duration": rng.exponential(10, n),
str(self.output_path),
]
- result = subprocess.run(
- cmd, capture_output=True, text=True, cwd=Path(__file__).parent
- )
+ result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent)
# Exit 0
self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
]
# Execute both runs
- result1 = subprocess.run(
- cmd1, capture_output=True, text=True, cwd=Path(__file__).parent
- )
- result2 = subprocess.run(
- cmd2, capture_output=True, text=True, cwd=Path(__file__).parent
- )
+ result1 = subprocess.run(cmd1, capture_output=True, text=True, cwd=Path(__file__).parent)
+ result2 = subprocess.run(cmd2, capture_output=True, text=True, cwd=Path(__file__).parent)
self.assertEqual(result1.returncode, 0)
self.assertEqual(result2.returncode, 0)
"idle_duration": idle_duration,
"reward_idle": reward_idle,
"position": np.random.choice([0.0, 0.5, 1.0], n),
- "reward_total": np.random.normal(0, 1, n),
+ "reward": np.random.normal(0, 1, n),
"pnl": np.random.normal(0, self.TEST_PNL_STD, n),
"trade_duration": np.random.exponential(20, n),
}
df2 = self._make_idle_variance_df(100)
# Shift second dataset
- df2["reward_total"] += 0.1
+ df2["reward"] += 0.1
metrics = compute_distribution_shift_metrics(df1, df2)
diagnostics = distribution_diagnostics(df)
# Expect keys
- expected_prefixes = ["reward_total_", "pnl_"]
+ expected_prefixes = ["reward_", "pnl_"]
for prefix in expected_prefixes:
- matching_keys = [
- key for key in diagnostics.keys() if key.startswith(prefix)
- ]
- self.assertGreater(
- len(matching_keys), 0, f"Should have diagnostics for {prefix}"
- )
+ matching_keys = [key for key in diagnostics.keys() if key.startswith(prefix)]
+ self.assertGreater(len(matching_keys), 0, f"Should have diagnostics for {prefix}")
# Basic moments
expected_suffixes = ["mean", "std", "skewness", "kurtosis"]
results = statistical_hypothesis_tests(base)
self.assertIsInstance(results, dict)
- def test_stats_constant_distribution_bootstrap_and_diagnostics(self):
- """Bootstrap on constant columns (degenerate)."""
- df = self._const_df(80)
- res = bootstrap_confidence_intervals(
- df, ["reward_total", "pnl"], n_bootstrap=200, confidence_level=0.95
- )
- for k, (mean, lo, hi) in res.items(): # tuple: mean, low, high
- self.assertAlmostEqualFloat(mean, lo, tolerance=2e-9)
- self.assertAlmostEqualFloat(mean, hi, tolerance=2e-9)
- self.assertLessEqual(hi - lo, 2e-9)
-
def test_stats_js_distance_symmetry_violin(self):
"""JS distance symmetry d(P,Q)==d(Q,P)."""
df1 = self._shift_scale_df(300, shift=0.0)
if js_key is None:
self.skipTest("JS distance key not present in metrics output")
metrics_swapped = compute_distribution_shift_metrics(df2, df1)
- js_key_swapped = next(
- (k for k in metrics_swapped if k.endswith("pnl_js_distance")), None
- )
+ js_key_swapped = next((k for k in metrics_swapped if k.endswith("pnl_js_distance")), None)
self.assertIsNotNone(js_key_swapped)
self.assertAlmostEqualFloat(
metrics[js_key],
rtol=self.TOL_RELATIVE,
)
- def test_stats_js_distance_symmetry_helper(self):
- """JS distance properties: symmetry d(P,Q)=d(Q,P) and upper bound sqrt(log 2)."""
- rng = np.random.default_rng(777)
- p_raw = rng.uniform(0.0, 1.0, size=400)
- q_raw = rng.uniform(0.0, 1.0, size=400)
- p = p_raw / p_raw.sum()
- q = q_raw / q_raw.sum()
-
- def _kl(a: np.ndarray, b: np.ndarray) -> float:
- mask = (a > 0) & (b > 0)
- return float(np.sum(a[mask] * np.log(a[mask] / b[mask])))
-
- def js_distance(a: np.ndarray, b: np.ndarray) -> float:
- m = 0.5 * (a + b)
- js_div = 0.5 * _kl(a, m) + 0.5 * _kl(b, m)
- return math.sqrt(max(js_div, 0.0))
-
- # Symmetry
- self.assertSymmetric(
- js_distance, p, q, atol=self.TOL_IDENTITY_STRICT, rtol=self.TOL_RELATIVE
- )
- # Upper bound
- self.assertLessEqual(
- js_distance(p, q), self.JS_DISTANCE_UPPER_BOUND + self.TOL_IDENTITY_STRICT
- )
-
- def test_stats_bootstrap_shrinkage_with_sample_size(self):
- """Bootstrap CI shrinks ~1/sqrt(n)."""
- small = self._shift_scale_df(80)
- large = self._shift_scale_df(800)
- res_small = bootstrap_confidence_intervals(
- small, ["reward_total"], n_bootstrap=400
- )
- res_large = bootstrap_confidence_intervals(
- large, ["reward_total"], n_bootstrap=400
- )
- (_, lo_s, hi_s) = list(res_small.values())[0]
- (_, lo_l, hi_l) = list(res_large.values())[0]
- hw_small = (hi_s - lo_s) / 2.0
- hw_large = (hi_l - lo_l) / 2.0
- self.assertFinite(hw_small, name="hw_small")
- self.assertFinite(hw_large, name="hw_large")
- self.assertLess(hw_large, hw_small * 0.55)
-
def test_stats_variance_vs_duration_spearman_sign(self):
"""trade_duration up => pnl variance up (rank corr >0)."""
rng = np.random.default_rng(99)
n = 250
trade_duration = np.linspace(1, 300, n)
pnl = rng.normal(0, 1 + trade_duration / 400.0, n)
- df = pd.DataFrame(
- {"trade_duration": trade_duration, "pnl": pnl, "reward_total": pnl}
- )
ranks_dur = pd.Series(trade_duration).rank().to_numpy()
ranks_var = pd.Series(np.abs(pnl)).rank().to_numpy()
rho = np.corrcoef(ranks_dur, ranks_var)[0, 1]
df_a = self._shift_scale_df(120)
df_b = self._shift_scale_df(180, shift=0.2)
m_concat = pd.concat([df_a["pnl"], df_b["pnl"]]).mean()
- m_weighted = (
- df_a["pnl"].mean() * len(df_a) + df_b["pnl"].mean() * len(df_b)
- ) / (len(df_a) + len(df_b))
+ m_weighted = (df_a["pnl"].mean() * len(df_a) + df_b["pnl"].mean() * len(df_b)) / (
+ len(df_a) + len(df_b)
+ )
self.assertAlmostEqualFloat(
m_concat,
m_weighted,
rtol=self.TOL_RELATIVE,
)
- def test_stats_ks_statistic_bounds(self):
- """KS in [0,1]."""
- df1 = self._shift_scale_df(150)
- df2 = self._shift_scale_df(150, shift=0.4)
- metrics = compute_distribution_shift_metrics(df1, df2)
- for k, v in metrics.items():
- if k.endswith("_ks_statistic"):
- self.assertWithin(v, 0.0, 1.0, name=k)
-
def test_stats_bh_correction_null_false_positive_rate(self):
"""Null: low BH discovery rate."""
rng = np.random.default_rng(1234)
df = pd.DataFrame(
{
"pnl": rng.normal(0, 1, n),
- "reward_total": rng.normal(0, 1, n),
+ "reward": rng.normal(0, 1, n),
"idle_duration": rng.exponential(5, n),
}
)
flags.append(bool(v["significant"]))
if flags:
rate = sum(flags) / len(flags)
- self.assertLess(
- rate, 0.15, f"BH null FP rate too high under null: {rate:.3f}"
- )
+ self.assertLess(rate, 0.15, f"BH null FP rate too high under null: {rate:.3f}")
def test_stats_half_life_monotonic_series(self):
"""Smoothed exponential decay monotonic."""
y_smooth = np.convolve(y_noisy, np.ones(window) / window, mode="valid")
self.assertMonotonic(y_smooth, non_increasing=True, tolerance=1e-5)
- def test_stats_bootstrap_confidence_intervals_basic(self):
- """Bootstrap CI calculation (basic)."""
- test_data = self.make_stats_df(n=100, seed=self.SEED)
- results = bootstrap_confidence_intervals(
- test_data,
- ["reward_total", "pnl"],
- n_bootstrap=100,
- )
- for metric, (mean, ci_low, ci_high) in results.items():
- self.assertFinite(mean, name=f"mean[{metric}]")
- self.assertFinite(ci_low, name=f"ci_low[{metric}]")
- self.assertFinite(ci_high, name=f"ci_high[{metric}]")
- self.assertLess(ci_low, ci_high)
-
def test_stats_hypothesis_seed_reproducibility(self):
"""Seed reproducibility for statistical_hypothesis_tests + bootstrap."""
- df = self.make_stats_df(n=300, seed=123, idle_pattern="mixed")
+ df = self.make_stats_df(n=300, seed=self.SEED, idle_pattern="mixed")
r1 = statistical_hypothesis_tests(df, seed=777)
r2 = statistical_hypothesis_tests(df, seed=777)
self.assertEqual(set(r1.keys()), set(r2.keys()))
continue
self.assertEqual(v1, v2, f"Mismatch for {k}:{field}")
# Bootstrap reproducibility
- metrics = ["reward_total", "pnl"]
+ metrics = ["reward", "pnl"]
ci_a = bootstrap_confidence_intervals(df, metrics, n_bootstrap=150, seed=2024)
ci_b = bootstrap_confidence_intervals(df, metrics, n_bootstrap=150, seed=2024)
- self.assertEqual(ci_a, ci_b)
+ # Compare floats with strict identity + scale-aware relative tolerance to avoid flaky exact equality
+ for metric in metrics:
+ m_a, lo_a, hi_a = ci_a[metric]
+ m_b, lo_b, hi_b = ci_b[metric]
+ self.assertAlmostEqualFloat(
+ m_a, m_b, tolerance=self.TOL_IDENTITY_STRICT, rtol=self.TOL_RELATIVE
+ )
+ self.assertAlmostEqualFloat(
+ lo_a, lo_b, tolerance=self.TOL_IDENTITY_STRICT, rtol=self.TOL_RELATIVE
+ )
+ self.assertAlmostEqualFloat(
+ hi_a, hi_b, tolerance=self.TOL_IDENTITY_STRICT, rtol=self.TOL_RELATIVE
+ )
def test_stats_distribution_metrics_mathematical_bounds(self):
"""Mathematical bounds and validity of distribution shift metrics."""
def test_stats_heteroscedasticity_pnl_validation(self):
"""PnL variance increases with trade duration (heteroscedasticity)."""
df = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=100),
num_samples=1000,
seed=123,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=100,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
"""All statistical functions respect bounds."""
df = self.make_stats_df(n=300, seed=self.SEED, idle_pattern="all_nonzero")
diagnostics = distribution_diagnostics(df)
- for col in ["reward_total", "pnl", "trade_duration", "idle_duration"]:
+ for col in ["reward", "pnl", "trade_duration", "idle_duration"]:
if f"{col}_skewness" in diagnostics:
- self.assertFinite(
- diagnostics[f"{col}_skewness"], name=f"skewness[{col}]"
- )
+ self.assertFinite(diagnostics[f"{col}_skewness"], name=f"skewness[{col}]")
if f"{col}_kurtosis" in diagnostics:
- self.assertFinite(
- diagnostics[f"{col}_kurtosis"], name=f"kurtosis[{col}]"
- )
+ self.assertFinite(diagnostics[f"{col}_kurtosis"], name=f"kurtosis[{col}]")
if f"{col}_shapiro_pval" in diagnostics:
self.assertPValue(
diagnostics[f"{col}_shapiro_pval"],
hypothesis_results = statistical_hypothesis_tests(df, seed=self.SEED)
for test_name, result in hypothesis_results.items():
if "p_value" in result:
- self.assertPValue(
- result["p_value"], msg=f"p-value bounds for {test_name}"
- )
+ self.assertPValue(result["p_value"], msg=f"p-value bounds for {test_name}")
if "effect_size_epsilon_sq" in result:
eps2 = result["effect_size_epsilon_sq"]
self.assertFinite(eps2, name=f"epsilon_sq[{test_name}]")
def test_stats_benjamini_hochberg_adjustment(self):
"""BH adjustment adds p_value_adj & significant_adj with valid bounds."""
df = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=100),
num_samples=600,
seed=123,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=100,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
pnl_base_std=self.TEST_PNL_STD,
pnl_duration_vol_scale=self.TEST_PNL_DUR_VOL_SCALE,
)
- results_adj = statistical_hypothesis_tests(
- df, adjust_method="benjamini_hochberg", seed=777
- )
+ results_adj = statistical_hypothesis_tests(df, adjust_method="benjamini_hochberg", seed=777)
self.assertGreater(len(results_adj), 0)
for name, res in results_adj.items():
self.assertIn("p_value", res)
pnl=0.02 if expected_type == "exit_component" else 0.0,
trade_duration=50 if position != Positions.Neutral else 0,
idle_duration=10 if position == Positions.Neutral else 0,
- max_trade_duration=100,
max_unrealized_profit=0.03,
min_unrealized_profit=-0.01,
position=position,
ctx = self.make_ctx(
pnl=0.0,
trade_duration=1,
- max_trade_duration=100,
max_unrealized_profit=0.0,
min_unrealized_profit=-0.02,
position=Positions.Long,
pnl=0.0,
trade_duration=0,
idle_duration=40,
- max_trade_duration=128,
- max_unrealized_profit=0.0,
- min_unrealized_profit=0.0,
position=Positions.Neutral,
action=Actions.Neutral,
)
pnl=0.0,
trade_duration=0,
idle_duration=30,
- max_trade_duration=100,
- max_unrealized_profit=0.0,
- min_unrealized_profit=0.0,
position=Positions.Neutral,
action=Actions.Neutral,
)
action_masking=True,
)
self.assertEqual(
- br.idle_penalty, 0.0, "Idle penalty should be zero when profit_target=0"
- )
- self.assertEqual(
- br.total, 0.0, "Total reward should be zero in this configuration"
+ br.idle_penalty,
+ 0.0,
+ "Idle penalty should be zero when profit_target=0",
)
+ self.assertEqual(br.total, 0.0, "Total reward should be zero in this configuration")
def test_win_reward_factor_saturation(self):
"""Saturation test: pnl amplification factor should monotonically approach (1 + win_reward_factor)."""
pnl=pnl,
trade_duration=0, # duration_ratio=0 -> attenuation = 1
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=pnl, # neutral wrt efficiency (disabled anyway)
min_unrealized_profit=0.0,
position=Positions.Long,
pnl=0.025,
trade_duration=40,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.03,
min_unrealized_profit=0.0,
position=Positions.Long,
pnl=-self.TEST_PNL_STD,
trade_duration=60,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.01,
min_unrealized_profit=-0.04,
position=Positions.Long,
pnl=0.0,
trade_duration=0,
idle_duration=35,
- max_trade_duration=120,
max_unrealized_profit=0.0,
min_unrealized_profit=0.0,
position=Positions.Neutral,
pnl=0.0,
trade_duration=80,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.04,
min_unrealized_profit=-0.01,
position=Positions.Long,
+ br.idle_penalty
+ br.hold_penalty
+ br.invalid_penalty
- + br.shaping_reward
+ + br.reward_shaping
+ br.entry_additive
+ br.exit_additive
)
pnl=pnl,
trade_duration=55,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=pnl if pnl > 0 else 0.01,
min_unrealized_profit=pnl if pnl < 0 else -0.01,
position=Positions.Long,
pnl=pnl,
trade_duration=55,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=pnl if pnl > 0 else 0.01,
min_unrealized_profit=pnl if pnl < 0 else -0.01,
position=Positions.Short,
def test_api_simulation_and_reward_smoke(self):
df = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=40),
num_samples=20,
seed=7,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=40,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
pnl=float(row["pnl"]),
trade_duration=int(row["trade_duration"]),
idle_duration=int(row["idle_duration"]),
- max_trade_duration=40,
max_unrealized_profit=float(row["pnl"]) + 0.01,
min_unrealized_profit=float(row["pnl"]) - 0.01,
position=Positions.Long,
def test_simulate_samples_trading_modes_spot_vs_margin(self):
"""simulate_samples coverage: spot should forbid shorts, margin should allow them."""
df_spot = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=100),
num_samples=80,
seed=self.SEED,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=100,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
pnl_base_std=self.TEST_PNL_STD,
pnl_duration_vol_scale=self.TEST_PNL_DUR_VOL_SCALE,
)
- short_positions_spot = (
- df_spot["position"] == float(Positions.Short.value)
- ).sum()
+ short_positions_spot = (df_spot["position"] == float(Positions.Short.value)).sum()
self.assertEqual(
- short_positions_spot, 0, "Spot mode must not contain short positions"
+ short_positions_spot,
+ 0,
+ "Spot mode must not contain short positions",
)
df_margin = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=100),
num_samples=80,
seed=self.SEED,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=100,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
"idle_duration",
"position",
"action",
- "reward_total",
+ "reward",
"reward_invalid",
"reward_idle",
"reward_hold",
"""Test _to_bool with various inputs."""
# Test via simulate_samples which uses action_masking parameter
df1 = simulate_samples(
+ params=self.base_params(action_masking="true", max_trade_duration_candles=50),
num_samples=10,
seed=self.SEED,
- params={"action_masking": "true"},
- max_trade_duration=50,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
self.assertIsInstance(df1, pd.DataFrame)
df2 = simulate_samples(
+ params=self.base_params(action_masking="false", max_trade_duration_candles=50),
num_samples=10,
seed=self.SEED,
- params={"action_masking": "false"},
- max_trade_duration=50,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
"""Test _is_short_allowed via different trading modes."""
# Test futures mode (shorts allowed)
df_futures = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=50),
num_samples=100,
seed=self.SEED,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=50,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
# Should have some short positions
short_positions = (df_futures["position"] == float(Positions.Short.value)).sum()
- self.assertGreater(
- short_positions, 0, "Futures mode should allow short positions"
- )
+ self.assertGreater(short_positions, 0, "Futures mode should allow short positions")
def test_get_float_param(self):
"""Test float parameter extraction."""
self.assertEqual(_get_float_param(params, "test_int", 0.0), 2.0)
# Non parseable string -> NaN fallback in tolerant parser
val_str = _get_float_param(params, "test_str", 0.0)
- if isinstance(val_str, float) and math.isnan(val_str):
- pass
- else:
- self.fail("Expected NaN for non-numeric string in _get_float_param")
+ self.assertTrue(isinstance(val_str, float))
+ self.assertTrue(math.isnan(val_str))
self.assertEqual(_get_float_param(params, "missing", 3.14), 3.14)
def test_get_str_param(self):
self.assertEqual(_get_int_param({"k": 9.99}, "k", 0), 9)
self.assertEqual(_get_int_param({"k": -3.7}, "k", 0), -3)
# Non-finite floats fallback
- self.assertEqual(_get_int_param({"k": float("nan")}, "k", 4), 4)
+ self.assertEqual(_get_int_param({"k": np.nan}, "k", 4), 4)
self.assertEqual(_get_int_param({"k": float("inf")}, "k", 4), 4)
# String numerics (int, float, exponent)
self.assertEqual(_get_int_param({"k": "42"}, "k", 0), 42)
# Create comprehensive test data
test_data = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=100),
num_samples=200,
seed=self.SEED,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=100,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
write_complete_statistical_analysis(
test_data,
output_path,
- max_trade_duration=100,
profit_target=self.TEST_PROFIT_TARGET,
seed=self.SEED,
real_df=None,
# Check that main report is created
main_report = output_path / "statistical_analysis.md"
self.assertTrue(
- main_report.exists(), "Main statistical analysis should be created"
+ main_report.exists(),
+ "Main statistical analysis should be created",
)
# Check for other expected files
feature_file = output_path / "feature_importance.csv"
- self.assertTrue(
- feature_file.exists(), "Feature importance should be created"
- )
+ self.assertTrue(feature_file.exists(), "Feature importance should be created")
class TestPrivateFunctions(RewardSpaceTestBase):
def test_idle_penalty_via_rewards(self):
"""Test idle penalty calculation via reward calculation."""
# Create context that will trigger idle penalty
- context = RewardContext(
+ context = self.make_ctx(
pnl=0.0,
trade_duration=0,
idle_duration=20,
- max_trade_duration=100,
max_unrealized_profit=0.0,
min_unrealized_profit=0.0,
position=Positions.Neutral,
self.assertAlmostEqualFloat(
breakdown.total,
breakdown.idle_penalty
- + breakdown.shaping_reward
+ + breakdown.reward_shaping
+ breakdown.entry_additive
+ breakdown.exit_additive,
tolerance=self.TOL_IDENTITY_RELAXED,
def test_hold_penalty_via_rewards(self):
"""Test hold penalty calculation via reward calculation."""
# Create context that will trigger hold penalty
- context = RewardContext(
+ context = self.make_ctx(
pnl=0.01,
trade_duration=150,
idle_duration=0, # Long duration
- max_trade_duration=100,
max_unrealized_profit=0.02,
min_unrealized_profit=0.0,
position=Positions.Long,
self.assertAlmostEqualFloat(
breakdown.total,
breakdown.hold_penalty
- + breakdown.shaping_reward
+ + breakdown.reward_shaping
+ breakdown.entry_additive
+ breakdown.exit_additive,
tolerance=self.TOL_IDENTITY_RELAXED,
"""Test exit reward calculation with various scenarios."""
scenarios = [
(Positions.Long, Actions.Long_exit, 0.05, "Profitable long exit"),
- (Positions.Short, Actions.Short_exit, -0.03, "Profitable short exit"),
+ (
+ Positions.Short,
+ Actions.Short_exit,
+ -0.03,
+ "Profitable short exit",
+ ),
(Positions.Long, Actions.Long_exit, -0.02, "Losing long exit"),
(Positions.Short, Actions.Short_exit, 0.02, "Losing short exit"),
]
for position, action, pnl, description in scenarios:
with self.subTest(description=description):
- context = RewardContext(
+ context = self.make_ctx(
pnl=pnl,
trade_duration=50,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=max(pnl + 0.01, 0.01),
min_unrealized_profit=min(pnl - 0.01, -0.01),
position=position,
def test_invalid_action_handling(self):
"""Test invalid action penalty."""
# Try to exit long when in short position (invalid)
- context = RewardContext(
+ context = self.make_ctx(
pnl=0.02,
trade_duration=50,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.03,
min_unrealized_profit=0.01,
position=Positions.Short,
)
self.assertLess(
- breakdown.invalid_penalty, 0, "Invalid action should have negative penalty"
+ breakdown.invalid_penalty,
+ 0,
+ "Invalid action should have negative penalty",
)
self.assertAlmostEqualFloat(
breakdown.total,
breakdown.invalid_penalty
- + breakdown.shaping_reward
+ + breakdown.reward_shaping
+ breakdown.entry_additive
+ breakdown.exit_additive,
tolerance=self.TOL_IDENTITY_RELAXED,
for trade_duration, description in test_cases:
with self.subTest(duration=trade_duration, desc=description):
- context = RewardContext(
+ context = self.make_ctx(
pnl=0.0, # Neutral PnL to isolate hold penalty
trade_duration=trade_duration,
idle_duration=0,
- max_trade_duration=max_duration,
max_unrealized_profit=0.0,
min_unrealized_profit=0.0,
position=Positions.Long,
self.assertAlmostEqualFloat(
breakdown.total,
breakdown.hold_penalty
- + breakdown.shaping_reward
+ + breakdown.reward_shaping
+ breakdown.entry_additive
+ breakdown.exit_additive,
tolerance=self.TOL_IDENTITY_RELAXED,
def test_hold_penalty_progressive_scaling(self):
"""Test that hold penalty scales progressively after max_duration."""
- max_duration = 100
+ params = self.base_params(max_trade_duration_candles=100)
durations = [150, 200, 300] # All > max_duration
penalties: list[float] = []
for duration in durations:
- context = RewardContext(
+ context = self.make_ctx(
pnl=0.0,
trade_duration=duration,
idle_duration=0,
- max_trade_duration=max_duration,
max_unrealized_profit=0.0,
min_unrealized_profit=0.0,
position=Positions.Long,
breakdown = calculate_reward(
context,
- self.DEFAULT_PARAMS,
+ params,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
self.assertLessEqual(
penalties[i],
penalties[i - 1],
- f"Penalty should increase with duration: {penalties[i]} > {penalties[i-1]}",
+ f"Penalty should increase with duration: {penalties[i]} > {penalties[i - 1]}",
)
def test_new_invariant_and_warn_parameters(self):
self.assertIn("check_invariants", params)
self.assertIn("exit_factor_threshold", params)
- context = RewardContext(
+ context = self.make_ctx(
pnl=0.05,
trade_duration=300,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.06,
min_unrealized_profit=0.0,
position=Positions.Long,
"""Robustness & boundary assertions: invariants, attenuation maths, parameter edges, scaling, warnings."""
def test_decomposition_integrity(self):
- """reward_total must equal the single active core component under mutually exclusive scenarios (idle/hold/exit/invalid)."""
+ """reward must equal the single active core component under mutually exclusive scenarios (idle/hold/exit/invalid)."""
scenarios = [
# Idle penalty only
dict(
- ctx=RewardContext(
+ ctx=self.make_ctx(
pnl=0.0,
trade_duration=0,
idle_duration=25,
- max_trade_duration=100,
max_unrealized_profit=0.0,
min_unrealized_profit=0.0,
position=Positions.Neutral,
),
# Hold penalty only
dict(
- ctx=RewardContext(
+ ctx=self.make_ctx(
pnl=0.0,
trade_duration=150,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.0,
min_unrealized_profit=0.0,
position=Positions.Long,
pnl=self.TEST_PROFIT_TARGET,
trade_duration=60,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.05,
min_unrealized_profit=0.01,
position=Positions.Long,
),
# Invalid action only
dict(
- ctx=RewardContext(
+ ctx=self.make_ctx(
pnl=0.01,
trade_duration=10,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.02,
min_unrealized_profit=0.0,
position=Positions.Short,
active_label: str = sc["active"] # type: ignore[index]
with self.subTest(active=active_label):
# Build parameters disabling shaping and additives to enforce strict decomposition
- params_local = self.base_params(
+ params = self.base_params(
entry_additive_enabled=False,
exit_additive_enabled=False,
hold_potential_enabled=False,
)
br = calculate_reward(
ctx_obj,
- params_local,
+ params,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
)
# Shaping and additives explicitly disabled
self.assertAlmostEqualFloat(
- br.shaping_reward, 0.0, tolerance=self.TOL_IDENTITY_RELAXED
+ br.reward_shaping, 0.0, tolerance=self.TOL_IDENTITY_RELAXED
)
self.assertAlmostEqualFloat(
br.entry_additive, 0.0, tolerance=self.TOL_IDENTITY_RELAXED
def test_pnl_invariant_exit_only(self):
"""Invariant: only exit actions have non-zero PnL (robustness category)."""
df = simulate_samples(
+ params=self.base_params(max_trade_duration_candles=50),
num_samples=200,
seed=self.SEED,
- params=self.DEFAULT_PARAMS,
- max_trade_duration=50,
base_factor=self.TEST_BASE_FACTOR,
profit_target=self.TEST_PROFIT_TARGET,
risk_reward_ratio=self.TEST_RR,
places=10,
msg="PnL invariant violation: total PnL != sum of exit PnL",
)
- non_zero_pnl_actions = set(df[df["pnl"] != 0]["action"].unique())
+ non_zero_pnl_actions = set(df[df["pnl"].abs() > self.EPS_BASE]["action"].unique())
expected_exit_actions = {2.0, 4.0}
self.assertTrue(
non_zero_pnl_actions.issubset(expected_exit_actions),
f"Non-exit actions have PnL: {non_zero_pnl_actions - expected_exit_actions}",
)
- invalid_combinations = df[(df["pnl"] == 0) & (df["reward_exit"] != 0)]
+ invalid_combinations = df[(df["pnl"].abs() <= self.EPS_BASE) & (df["reward_exit"] != 0)]
self.assertEqual(len(invalid_combinations), 0)
def test_exit_factor_mathematical_formulas(self):
pnl=0.05,
trade_duration=50,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.05,
min_unrealized_profit=0.01,
position=Positions.Long,
short_allowed=True,
action_masking=True,
)
- expected_half_life_factor = 2 ** (-duration_ratio / 0.5)
- self.assertPlacesEqual(expected_half_life_factor, 0.5, places=6)
+ # Validate half-life attenuation factor against expected closed-form: 2 ** (-dr / half_life)
+ pnl_factor_hl = _get_pnl_factor(
+ params,
+ context,
+ self.TEST_PROFIT_TARGET,
+ self.TEST_RR,
+ )
+ observed_exit_factor = _get_exit_factor(
+ self.TEST_BASE_FACTOR,
+ context.pnl,
+ pnl_factor_hl,
+ duration_ratio,
+ params,
+ )
+ # Isolate attenuation factor
+ observed_half_life_factor = observed_exit_factor / (
+ self.TEST_BASE_FACTOR * max(pnl_factor_hl, self.EPS_BASE)
+ )
+ expected_half_life_factor = 2 ** (-duration_ratio / params["exit_half_life"])
+ self.assertAlmostEqualFloat(
+ observed_half_life_factor,
+ expected_half_life_factor,
+ tolerance=self.TOL_IDENTITY_RELAXED,
+ msg="Half-life attenuation mismatch: observed vs expected",
+ )
params["exit_attenuation_mode"] = "linear"
params["exit_linear_slope"] = 1.0
reward_linear = calculate_reward(
def test_idle_penalty_fallback_and_proportionality(self):
"""Idle penalty fallback denominator & proportional scaling (robustness)."""
- params = self.base_params(max_idle_duration_candles=None)
+ params = self.base_params(max_idle_duration_candles=None, max_trade_duration_candles=100)
base_factor = 90.0
profit_target = self.TEST_PROFIT_TARGET
risk_reward_ratio = 1.0
pnl=0.0,
trade_duration=0,
idle_duration=20,
- max_trade_duration=100,
position=Positions.Neutral,
action=Actions.Neutral,
)
)
self.assertLess(br_a.idle_penalty, 0.0)
self.assertLess(br_b.idle_penalty, 0.0)
- ratio = (
- br_b.idle_penalty / br_a.idle_penalty if br_a.idle_penalty != 0 else None
- )
+ ratio = br_b.idle_penalty / br_a.idle_penalty if br_a.idle_penalty != 0 else None
self.assertIsNotNone(ratio)
self.assertAlmostEqualFloat(abs(ratio), 2.0, tolerance=0.2)
- ctx_mid = dataclasses.replace(ctx_a, idle_duration=120, max_trade_duration=100)
+ ctx_mid = dataclasses.replace(ctx_a, idle_duration=120)
br_mid = calculate_reward(
ctx_mid,
params,
pnl=0.08,
trade_duration=10,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.09,
min_unrealized_profit=0.0,
position=Positions.Long,
)
def test_negative_slope_sanitization(self):
- """Negative exit_linear_slope is sanitized to 1.0; resulting exit factors must match slope=1.0 within tolerance."""
+ """Negative exit_linear_slope is sanitized to 0.0; resulting exit factors must match slope=0.0 within tolerance."""
base_factor = 100.0
pnl = 0.03
pnl_factor = 1.0
duration_ratios = [0.0, 0.2, 0.5, 1.0, 1.5]
params_bad = self.base_params(
- exit_attenuation_mode="linear", exit_linear_slope=-5.0, exit_plateau=False
+ exit_attenuation_mode="linear",
+ exit_linear_slope=-5.0,
+ exit_plateau=False,
)
params_ref = self.base_params(
- exit_attenuation_mode="linear", exit_linear_slope=1.0, exit_plateau=False
+ exit_attenuation_mode="linear",
+ exit_linear_slope=0.0,
+ exit_plateau=False,
)
for dr in duration_ratios:
f_bad = _get_exit_factor(base_factor, pnl, pnl_factor, dr, params_bad)
0.5,
0.25,
1.0,
- ] # include boundary 1.0 => alpha=0 per formula? actually -> -log(1)/log2 = 0
+ ]
for tau in taus:
params = self.base_params(
- exit_attenuation_mode="power", exit_power_tau=tau, exit_plateau=False
+ exit_attenuation_mode="power",
+ exit_power_tau=tau,
+ exit_plateau=False,
)
f0 = _get_exit_factor(base_factor, pnl, pnl_factor, 0.0, params)
f1 = _get_exit_factor(base_factor, pnl, pnl_factor, duration_ratio, params)
else:
alpha = 1.0
expected_ratio = 1.0 / (1.0 + duration_ratio) ** alpha
- observed_ratio = f1 / f0 if f0 != 0 else float("nan")
+ observed_ratio = f1 / f0 if f0 != 0 else np.nan
self.assertFinite(observed_ratio, name="observed_ratio")
self.assertLess(
abs(observed_ratio - expected_ratio),
pnl=0.05,
trade_duration=50,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.06,
min_unrealized_profit=0.02,
position=Positions.Long,
for mode in modes:
with self.subTest(mode=mode):
test_params = self.base_params(exit_attenuation_mode=mode)
- ctx = RewardContext(
+ ctx = self.make_ctx(
pnl=0.02,
trade_duration=50,
idle_duration=0,
- max_trade_duration=100,
max_unrealized_profit=0.03,
min_unrealized_profit=0.01,
position=Positions.Long,
params = self.base_params(exit_attenuation_mode="sqrt")
ratios = np.linspace(0, 2, 15)
- values = [
- _get_exit_factor(base_factor, pnl, pnl_factor, r, params)
- for r in ratios
- ]
+ values = [_get_exit_factor(base_factor, pnl, pnl_factor, r, params) for r in ratios]
# Plateau+linear: ignore initial flat region when checking monotonic decrease
if mode == "plateau_linear":
grace = float(params["exit_plateau_grace"]) # type: ignore[index]
filtered = [
- (r, v)
- for r, v in zip(ratios, values)
- if r >= grace - self.TOL_IDENTITY_RELAXED
+ (r, v) for r, v in zip(ratios, values) if r >= grace - self.TOL_IDENTITY_RELAXED
]
values_to_check = [v for _, v in filtered]
else:
pnl = 0.02
pnl_factor = 1.0
# Tau near 1 (minimal attenuation) vs tau near 0 (strong attenuation)
- params_hi = self.base_params(
- exit_attenuation_mode="power", exit_power_tau=0.999999
- )
+ params_hi = self.base_params(exit_attenuation_mode="power", exit_power_tau=0.999999)
params_lo = self.base_params(
exit_attenuation_mode="power",
exit_power_tau=self.MIN_EXIT_POWER_TAU,
pnl = 0.04
pnl_factor = 1.2
ratios = [0.3, 0.6, 1.0, 1.4]
- values = [
- _get_exit_factor(base_factor, pnl, pnl_factor, r, params) for r in ratios
- ]
+ values = [_get_exit_factor(base_factor, pnl, pnl_factor, r, params) for r in ratios]
# All factors should be (approximately) identical after grace (no attenuation)
first = values[0]
for v in values[1:]:
pnl_factor = 1.1
# Ratios straddling 1.0 but below grace=1.5 plus one beyond grace
ratios = [0.8, 1.0, 1.2, 1.4, 1.6]
- vals = [
- _get_exit_factor(base_factor, pnl, pnl_factor, r, params) for r in ratios
- ]
+ vals = [_get_exit_factor(base_factor, pnl, pnl_factor, r, params) for r in ratios]
# All ratios <=1.5 should yield identical factor
ref = vals[0]
for i, r in enumerate(ratios[:-1]): # exclude last (1.6)
self.assertAlmostEqualFloat(
vals[i],
ref,
- self.TOL_IDENTITY_RELAXED,
+ tolerance=self.TOL_IDENTITY_RELAXED,
msg=f"Unexpected attenuation before grace end at ratio {r}",
)
# Last ratio (1.6) should be attenuated (strictly less than ref)
}
)
- left = _get_exit_factor(
- base_factor, pnl, pnl_factor, grace - eps, params
- )
+ left = _get_exit_factor(base_factor, pnl, pnl_factor, grace - eps, params)
boundary = _get_exit_factor(base_factor, pnl, pnl_factor, grace, params)
- right = _get_exit_factor(
- base_factor, pnl, pnl_factor, grace + eps, params
- )
+ right = _get_exit_factor(base_factor, pnl, pnl_factor, grace + eps, params)
self.assertAlmostEqualFloat(
left,
"idle_duration": [5],
"position": [1.0],
"action": [2.0],
- "reward_total": [1.0],
+ "reward": [1.0],
}
)
p = Path(self.temp_dir) / "top.pkl"
"idle_duration": 0,
"position": 1.0,
"action": 2.0,
- "reward_total": 2.0,
+ "reward": 2.0,
}
],
}
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
loaded = load_real_episodes(p)
- # Accept variance in warning emission across platforms
_ = w
self.assertEqual(len(loaded), 1)
self.write_pickle(trans, p)
loaded = load_real_episodes(p, enforce_columns=False)
- self.assertIn("reward_total", loaded.columns)
- self.assertTrue(loaded["reward_total"].isna().all())
+ self.assertIn("reward", loaded.columns)
+ self.assertTrue(loaded["reward"].isna().all())
def test_casting_numeric_strings(self):
trans = [
"idle_duration": "0",
"position": "1.0",
"action": "2.0",
- "reward_total": "3.0",
+ "reward": "3.0",
}
]
p = Path(self.temp_dir) / "strs.pkl"
"idle_duration": [5, 0, 8],
"position": [1.0, 0.0, 1.0],
"action": [2.0, 0.0, 2.0],
- "reward_total": [10.5, -5.2, 15.8],
+ "reward": [10.5, -5.2, 15.8],
}
)
p = Path(self.temp_dir) / "test_episodes.pkl"
current_pnl = 0.02
current_dur = 0.5
prev_potential = _compute_hold_potential(current_pnl, current_dur, params)
- _total_reward, shaping_reward, next_potential = apply_potential_shaping(
+ _total_reward, reward_shaping, next_potential = apply_potential_shaping(
base_reward=0.0,
current_pnl=current_pnl,
current_duration_ratio=current_dur,
- next_pnl=0.02,
- next_duration_ratio=0.6,
- is_terminal=True,
- last_potential=prev_potential,
+ next_pnl=0.0,
+ next_duration_ratio=0.0,
+ is_exit=True,
+ is_entry=False,
+ last_potential=0.789, # arbitrary, should be ignored for Φ'
params=params,
)
+ self.assertAlmostEqualFloat(next_potential, 0.0, tolerance=self.TOL_IDENTITY_RELAXED)
self.assertAlmostEqualFloat(
- next_potential, 0.0, tolerance=self.TOL_IDENTITY_RELAXED
- )
- self.assertAlmostEqualFloat(
- shaping_reward, -prev_potential, tolerance=self.TOL_IDENTITY_RELAXED
+ reward_shaping, -prev_potential, tolerance=self.TOL_IDENTITY_RELAXED
)
def test_pbrs_spike_cancel_invariance(self):
"potential_gamma",
DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95),
)
- expected_next = (
+ expected_next_potential = (
prev_potential / gamma if gamma not in (0.0, None) else prev_potential
)
- _total_reward, shaping_reward, next_potential = apply_potential_shaping(
+ _total_reward, reward_shaping, next_potential = apply_potential_shaping(
base_reward=0.0,
current_pnl=current_pnl,
current_duration_ratio=current_dur,
- next_pnl=0.016,
- next_duration_ratio=0.45,
- is_terminal=True,
+ next_pnl=0.0,
+ next_duration_ratio=0.0,
+ is_exit=True,
+ is_entry=False,
last_potential=prev_potential,
params=params,
)
self.assertAlmostEqualFloat(
- next_potential, expected_next, tolerance=self.TOL_IDENTITY_RELAXED
+ next_potential, expected_next_potential, tolerance=self.TOL_IDENTITY_RELAXED
)
- self.assertNearZero(shaping_reward, atol=self.TOL_IDENTITY_RELAXED)
+ self.assertNearZero(reward_shaping, atol=self.TOL_IDENTITY_RELAXED)
def test_tanh_transform(self):
"""tanh transform: tanh(x) in (-1, 1)."""
current_duration_ratio=current_duration_ratio,
next_pnl=next_pnl,
next_duration_ratio=next_duration_ratio,
- is_terminal=True,
+ is_exit=True,
+ is_entry=False,
last_potential=0.789, # arbitrary, should be ignored for Φ'
params=params,
)
{"hold_potential_enabled": True, "hold_potential_scale": 1.0},
)
# shaping should equal -current_potential within tolerance
- self.assertAlmostEqual(
- shaping, -current_potential, delta=self.TOL_IDENTITY_RELAXED
- )
+ self.assertAlmostEqual(shaping, -current_potential, delta=self.TOL_IDENTITY_RELAXED)
# Since additives are disabled, total ≈ base_reward + shaping (residual ~0)
residual = total - base_reward - shaping
self.assertAlmostEqual(residual, 0.0, delta=self.TOL_IDENTITY_RELAXED)
# Structural sweep (ensures terminal Φ'==0 and shaping bounded)
terminal_next_potentials, shaping_values = self._canonical_sweep(params)
- # Premier appel (terminal pour forcer chemin exit) pour activer le flag
_t1, _s1, _n1 = apply_potential_shaping(
base_reward=0.0,
current_pnl=0.05,
current_duration_ratio=0.3,
next_pnl=0.0,
next_duration_ratio=0.0,
- is_terminal=True,
+ is_exit=True,
+ is_entry=False,
last_potential=0.4,
params=params,
)
self.assertFalse(params["entry_additive_enabled"])
self.assertFalse(params["exit_additive_enabled"])
if terminal_next_potentials:
- self.assertTrue(
- all(abs(p) < self.PBRS_TERMINAL_TOL for p in terminal_next_potentials)
- )
+ self.assertTrue(all(abs(p) < self.PBRS_TERMINAL_TOL for p in terminal_next_potentials))
max_abs = max(abs(v) for v in shaping_values) if shaping_values else 0.0
self.assertLessEqual(max_abs, self.PBRS_MAX_ABS_SHAPING)
current_duration_ratio=0.1,
next_pnl=0.0,
next_duration_ratio=0.0,
- is_terminal=True,
+ is_exit=True,
+ is_entry=False,
last_potential=0.1,
params=params,
)
current_duration_ratio=0.0,
next_pnl=0.0,
next_duration_ratio=0.0,
- is_terminal=True,
+ is_exit=True,
last_potential=last_potential,
params=params,
)
gamma = float(gamma_raw) # type: ignore[assignment]
except Exception:
gamma = 0.95
- self.assertLessEqual(
- abs(shaping - gamma * last_potential), self.TOL_GENERIC_EQ
- )
+ self.assertLessEqual(
+ abs(shaping - gamma * last_potential),
+ self.TOL_GENERIC_EQ,
+ )
self.assertPlacesEqual(total, shaping, places=12)
def test_potential_gamma_nan_fallback(self):
"""potential_gamma=NaN should fall back to default value (indirect comparison)."""
base_params_dict = self.base_params()
default_gamma = base_params_dict.get("potential_gamma", 0.95)
- params_nan = self.base_params(
- potential_gamma=float("nan"), hold_potential_enabled=True
- )
+ params_nan = self.base_params(potential_gamma=np.nan, hold_potential_enabled=True)
# Non-terminal transition so Φ(s') is computed and depends on gamma
res_nan = apply_potential_shaping(
base_reward=0.1,
current_duration_ratio=0.2,
next_pnl=0.035,
next_duration_ratio=0.25,
- is_terminal=False,
+ is_exit=False,
last_potential=0.0,
params=params_nan,
)
- params_ref = self.base_params(
- potential_gamma=default_gamma, hold_potential_enabled=True
- )
+ params_ref = self.base_params(potential_gamma=default_gamma, hold_potential_enabled=True)
res_ref = apply_potential_shaping(
base_reward=0.1,
current_pnl=0.03,
current_duration_ratio=0.2,
next_pnl=0.035,
next_duration_ratio=0.25,
- is_terminal=False,
+ is_exit=False,
last_potential=0.0,
params=params_ref,
)
except Exception as e: # pragma: no cover
self.fail(f"validate_reward_parameters raised unexpectedly: {e}")
# validate_reward_parameters may return (params, diagnostics) or just params
- if (
- isinstance(validated, tuple)
- and len(validated) >= 1
- and isinstance(validated[0], dict)
- ):
+ if isinstance(validated, tuple) and len(validated) >= 1 and isinstance(validated[0], dict):
validated_params = validated[0]
else:
validated_params = validated # type: ignore[assignment]
- for k in ("potential_gamma", "hold_potential_enabled", "exit_potential_mode"):
+ for k in (
+ "potential_gamma",
+ "hold_potential_enabled",
+ "exit_potential_mode",
+ ):
self.assertIn(k, validated_params, f"Missing key '{k}' in validated params")
# Introduce invalid values
msg="Canonical delta mismatch",
)
# Spike cancel mode
- params_spike = self.base_params(
- exit_potential_mode="spike_cancel", **base_common
- )
+ params_spike = self.base_params(exit_potential_mode="spike_cancel", **base_common)
next_phi_spike = _compute_exit_potential(prev_phi, params_spike)
shaping_spike = gamma * next_phi_spike - prev_phi
self.assertNearZero(
with self.subTest(transform=name):
vals = [apply_transform(name, x) for x in xs]
# Strict bounds (-1,1) (sigmoid & tanh asymptotic)
- self.assertTrue(
- all(-1.0 < v < 1.0 for v in vals), f"{name} out of bounds"
- )
+ self.assertTrue(all(-1.0 < v < 1.0 for v in vals), f"{name} out of bounds")
# Non-decreasing monotonicity
for a, b in zip(vals, vals[1:]):
self.assertLessEqual(
class TestReportFormatting(RewardSpaceTestBase):
- """Tests for report formatting elements not previously covered."""
+ """Tests for report formatting elements not covered elsewhere."""
def test_abs_shaping_line_present_and_constant(self):
"""Abs Σ Shaping Reward line present, formatted, uses constant not literal."""
- # Minimal synthetic construction to exercise invariance formatting logic.
- self.assertPlacesEqual(PBRS_INVARIANCE_TOL, self.TOL_GENERIC_EQ, places=12)
-
# Use small synthetic DataFrame with zero shaping sum (pandas imported globally)
df = pd.DataFrame(
{
- "reward_shaping": [self.TOL_IDENTITY_STRICT, -self.TOL_IDENTITY_STRICT],
+ "reward_shaping": [
+ self.TOL_IDENTITY_STRICT,
+ -self.TOL_IDENTITY_STRICT,
+ ],
"reward_entry_additive": [0.0, 0.0],
"reward_exit_additive": [0.0, 0.0],
}
content = "\n".join(lines)
# Validate formatting pattern using regex
m = re.search(
- r"\| Abs Σ Shaping Reward \| ([0-9]+\.[0-9]{6}e[+-][0-9]{2}) \|", content
+ r"\| Abs Σ Shaping Reward \| ([0-9]+\.[0-9]{6}e[+-][0-9]{2}) \|",
+ content,
)
self.assertIsNotNone(m, "Abs Σ Shaping Reward line missing or misformatted")
# Ensure scientific notation magnitude consistent with small number
val = float(m.group(1)) if m else None # type: ignore[arg-type]
if val is not None:
self.assertLess(val, self.TOL_NEGLIGIBLE + self.TOL_IDENTITY_STRICT)
- # Ensure no stray hard-coded tolerance string inside content
+ # Ensure no hard-coded tolerance string inside content
self.assertNotIn(
str(self.TOL_GENERIC_EQ),
content,
def test_pbrs_non_canonical_report_generation(self):
"""Generate synthetic invariance section with non-zero shaping to assert Non-canonical classification."""
- import re # local lightweight
df = pd.DataFrame(
{
section.append(f"| Note | Total shaping = {total_shaping:.6f} (non-zero) |\n")
section.append(f"| Σ Shaping Reward | {total_shaping:.6f} |\n")
section.append(f"| Abs Σ Shaping Reward | {abs(total_shaping):.6e} |\n")
- section.append(
- f"| Σ Entry Additive | {df['reward_entry_additive'].sum():.6f} |\n"
- )
- section.append(
- f"| Σ Exit Additive | {df['reward_exit_additive'].sum():.6f} |\n"
- )
+ section.append(f"| Σ Entry Additive | {df['reward_entry_additive'].sum():.6f} |\n")
+ section.append(f"| Σ Exit Additive | {df['reward_exit_additive'].sum():.6f} |\n")
content = "".join(section)
self.assertIn("❌ Non-canonical", content)
self.assertRegex(content, r"Σ Shaping Reward \| 0\.008000 \|")
"current_duration_ratio": 0.2,
"next_pnl": 0.012,
"next_duration_ratio": 0.25,
- "is_terminal": False,
+ "is_entry": True,
+ "is_exit": False,
}
_t0, s0, _n0 = apply_potential_shaping(last_potential=0.0, params=base, **ctx)
- t1, s1, _n1 = apply_potential_shaping(
- last_potential=0.0, params=with_add, **ctx
- )
+ t1, s1, _n1 = apply_potential_shaping(last_potential=0.0, params=with_add, **ctx)
self.assertFinite(t1)
self.assertFinite(s1)
# Additives should not alter invariance: shaping difference small
self.assertLess(abs(s1 - s0), 0.2)
- self.assertGreater(
- t1 - _t0, 0.0, "Total reward should increase with additives present"
- )
+ self.assertGreater(t1 - _t0, 0.0, "Total reward should increase with additives present")
def test_report_cumulative_invariance_aggregation(self):
"""Canonical telescoping term: small per-step mean drift, bounded increments."""
max_abs_step = 0.0
steps = 0
for _ in range(500):
- is_terminal = rng.uniform() < 0.1
+ is_exit = rng.uniform() < 0.1
current_pnl = float(rng.normal(0, 0.05))
current_dur = float(rng.uniform(0, 1))
- next_pnl = 0.0 if is_terminal else float(rng.normal(0, 0.05))
- next_dur = 0.0 if is_terminal else float(rng.uniform(0, 1))
+ next_pnl = 0.0 if is_exit else float(rng.normal(0, 0.05))
+ next_dur = 0.0 if is_exit else float(rng.uniform(0, 1))
_tot, _shap, next_potential = apply_potential_shaping(
base_reward=0.0,
current_pnl=current_pnl,
current_duration_ratio=current_dur,
next_pnl=next_pnl,
next_duration_ratio=next_dur,
- is_terminal=is_terminal,
+ is_exit=is_exit,
last_potential=last_potential,
params=params,
)
if abs(inc) > max_abs_step:
max_abs_step = abs(inc)
steps += 1
- if is_terminal:
+ if is_exit:
# Reset potential at terminal per canonical semantics
last_potential = 0.0
else:
last_potential = 0.0
shaping_sum = 0.0
for _ in range(160):
- is_terminal = rng.uniform() < 0.15
- next_pnl = 0.0 if is_terminal else float(rng.normal(0, 0.07))
- next_dur = 0.0 if is_terminal else float(rng.uniform(0, 1))
+ is_exit = rng.uniform() < 0.15
+ next_pnl = 0.0 if is_exit else float(rng.normal(0, 0.07))
+ next_dur = 0.0 if is_exit else float(rng.uniform(0, 1))
_tot, shap, next_pot = apply_potential_shaping(
base_reward=0.0,
current_pnl=float(rng.normal(0, 0.07)),
current_duration_ratio=float(rng.uniform(0, 1)),
next_pnl=next_pnl,
next_duration_ratio=next_dur,
- is_terminal=is_terminal,
+ is_exit=is_exit,
last_potential=last_potential,
params=params,
)
shaping_sum += shap
- last_potential = 0.0 if is_terminal else next_pot
+ last_potential = 0.0 if is_exit else next_pot
self.assertGreater(
abs(shaping_sum),
PBRS_INVARIANCE_TOL * 50,
current_duration_ratio=0.3,
next_pnl=0.025,
next_duration_ratio=0.35,
- is_terminal=False,
+ is_exit=False,
last_potential=0.0,
params=params,
)
"""Degenerate columns produce (mean≈lo≈hi) zero-width intervals."""
df = self._const_df(80)
res = bootstrap_confidence_intervals(
- df, ["reward_total", "pnl"], n_bootstrap=200, confidence_level=0.95
+ df, ["reward", "pnl"], n_bootstrap=200, confidence_level=0.95
)
for k, (mean, lo, hi) in res.items():
self.assertAlmostEqualFloat(mean, lo, tolerance=2e-9)
"""Half-width decreases with larger sample (~1/sqrt(n) heuristic)."""
small = self._shift_scale_df(80)
large = self._shift_scale_df(800)
- res_small = bootstrap_confidence_intervals(
- small, ["reward_total"], n_bootstrap=400
- )
- res_large = bootstrap_confidence_intervals(
- large, ["reward_total"], n_bootstrap=400
- )
+ res_small = bootstrap_confidence_intervals(small, ["reward"], n_bootstrap=400)
+ res_large = bootstrap_confidence_intervals(large, ["reward"], n_bootstrap=400)
(_, lo_s, hi_s) = list(res_small.values())[0]
(_, lo_l, hi_l) = list(res_large.values())[0]
hw_small = (hi_s - lo_s) / 2.0
test_data = self.make_stats_df(n=100, seed=self.SEED)
results = bootstrap_confidence_intervals(
test_data,
- ["reward_total", "pnl"],
+ ["reward", "pnl"],
n_bootstrap=100,
)
for metric, (mean, ci_low, ci_high) in results.items():
self.assertFinite(ci_high, name=f"ci_high[{metric}]")
self.assertLess(ci_low, ci_high)
+ def test_canonical_invariance_flag_and_sum(self):
+ """Canonical mode + no additives -> pbrs_invariant True and Σ shaping ≈ 0."""
+ params = self.base_params(
+ exit_potential_mode="canonical",
+ entry_additive_enabled=False,
+ exit_additive_enabled=False,
+ hold_potential_enabled=True,
+ )
+ df = simulate_samples(
+ params={**params, "max_trade_duration_candles": 100},
+ num_samples=400,
+ seed=self.SEED,
+ base_factor=self.TEST_BASE_FACTOR,
+ profit_target=self.TEST_PROFIT_TARGET,
+ risk_reward_ratio=self.TEST_RR,
+ max_duration_ratio=2.0,
+ trading_mode="margin",
+ pnl_base_std=self.TEST_PNL_STD,
+ pnl_duration_vol_scale=self.TEST_PNL_DUR_VOL_SCALE,
+ )
+ # pbrs_invariant must be True for all samples
+ unique_flags = set(df["pbrs_invariant"].unique().tolist())
+ self.assertEqual(unique_flags, {True}, f"Unexpected invariant flags: {unique_flags}")
+ # Σ shaping ≈ 0 within PBRS_INVARIANCE_TOL
+ total_shaping = float(df["reward_shaping"].sum())
+ self.assertLess(
+ abs(total_shaping),
+ PBRS_INVARIANCE_TOL,
+ f"Canonical invariance violated: Σ shaping = {total_shaping}",
+ )
+
+ def test_non_canonical_flag_false_and_sum_nonzero(self):
+ """Non-canonical exit potential (progressive_release) -> pbrs_invariant False and Σ shaping != 0."""
+ params = self.base_params(
+ exit_potential_mode="progressive_release",
+ exit_potential_decay=0.25,
+ entry_additive_enabled=False,
+ exit_additive_enabled=False,
+ hold_potential_enabled=True,
+ )
+ df = simulate_samples(
+ params={**params, "max_trade_duration_candles": 100},
+ num_samples=400,
+ seed=self.SEED,
+ base_factor=self.TEST_BASE_FACTOR,
+ profit_target=self.TEST_PROFIT_TARGET,
+ risk_reward_ratio=self.TEST_RR,
+ max_duration_ratio=2.0,
+ trading_mode="margin",
+ pnl_base_std=self.TEST_PNL_STD,
+ pnl_duration_vol_scale=self.TEST_PNL_DUR_VOL_SCALE,
+ )
+ unique_flags = set(df["pbrs_invariant"].unique().tolist())
+ self.assertEqual(unique_flags, {False}, f"Unexpected invariant flags: {unique_flags}")
+ total_shaping = float(df["reward_shaping"].sum())
+ self.assertGreater(
+ abs(total_shaping),
+ PBRS_INVARIANCE_TOL * 10,
+ f"Expected non-zero Σ shaping in non-canonical mode (got {total_shaping})",
+ )
+
+
+class TestCsvAndSimulationOptions(RewardSpaceTestBase):
+ """CLI-level tests: CSV encoding and simulate_unrealized_pnl option effects."""
+
+ def test_action_column_integer_in_csv(self):
+ """Ensure 'action' column in reward_samples.csv is encoded as integers."""
+ out_dir = self.output_path / "csv_int_check"
+ cmd = [
+ sys.executable,
+ "reward_space_analysis.py",
+ "--num_samples",
+ "200",
+ "--seed",
+ str(self.SEED),
+ "--out_dir",
+ str(out_dir),
+ ]
+ result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent)
+ self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
+ csv_path = out_dir / "reward_samples.csv"
+ self.assertTrue(csv_path.exists(), "Missing reward_samples.csv")
+ df = pd.read_csv(csv_path)
+ self.assertIn("action", df.columns)
+ # All values must be integral and in the expected enum set {0,1,2,3,4}
+ values = df["action"].tolist()
+ self.assertTrue(
+ all(float(v).is_integer() for v in values),
+ "Non-integer values detected in 'action' column",
+ )
+ allowed = {0, 1, 2, 3, 4}
+ self.assertTrue(set(int(v) for v in values).issubset(allowed))
+
+ def test_unrealized_pnl_affects_hold_potential(self):
+ """--unrealized_pnl should alter hold next_potential distribution vs default."""
+ out_default = self.output_path / "sim_default"
+ out_sim = self.output_path / "sim_unrealized"
+ base_args = [
+ "--num_samples",
+ "800",
+ "--seed",
+ str(self.SEED),
+ "--out_dir",
+ ]
+ # Default run
+ cmd_default = [
+ sys.executable,
+ "reward_space_analysis.py",
+ *base_args,
+ str(out_default),
+ ]
+ res_def = subprocess.run(
+ cmd_default,
+ capture_output=True,
+ text=True,
+ cwd=Path(__file__).parent,
+ )
+ self.assertEqual(res_def.returncode, 0, f"CLI default run failed: {res_def.stderr}")
+ # Run with --unrealized_pnl
+ cmd_sim = [
+ sys.executable,
+ "reward_space_analysis.py",
+ *base_args,
+ str(out_sim),
+ "--unrealized_pnl",
+ ]
+ res_sim = subprocess.run(cmd_sim, capture_output=True, text=True, cwd=Path(__file__).parent)
+ self.assertEqual(res_sim.returncode, 0, f"CLI simulated run failed: {res_sim.stderr}")
+
+ # Load CSVs
+ df_def = pd.read_csv(out_default / "reward_samples.csv")
+ df_sim = pd.read_csv(out_sim / "reward_samples.csv")
+ # Hold actions: position in {Long (1.0), Short (0.0)} and action == 0 (Neutral)
+ mask_hold_def = (df_def["action"] == 0) & (df_def["position"].isin([0.0, 1.0]))
+ mask_hold_sim = (df_sim["action"] == 0) & (df_sim["position"].isin([0.0, 1.0]))
+ # Sanity: ensure we have holds in both runs
+ self.assertGreater(int(mask_hold_def.sum()), 0, "No hold samples in default run")
+ self.assertGreater(int(mask_hold_sim.sum()), 0, "No hold samples in simulate run")
+ # Compare mean next_potential on holds: simulated should differ from default
+ mean_next_def = float(df_def.loc[mask_hold_def, "next_potential"].mean())
+ mean_next_sim = float(df_sim.loc[mask_hold_sim, "next_potential"].mean())
+ self.assertFinite(mean_next_def, name="mean_next_def")
+ self.assertFinite(mean_next_sim, name="mean_next_sim")
+ self.assertGreater(
+ abs(mean_next_sim - mean_next_def),
+ self.TOL_GENERIC_EQ,
+ f"No detectable effect of --unrealized_pnl on Φ(s): def={mean_next_def:.6f}, sim={mean_next_sim:.6f}",
+ )
+
+
+class TestParamsPropagation(RewardSpaceTestBase):
+ """Integration tests to validate max_trade_duration_candles propagation via CLI params and dynamic flag."""
+
+ def test_max_trade_duration_candles_propagation_params(self):
+ """--params max_trade_duration_candles=X propagates to manifest and simulation params."""
+ out_dir = self.output_path / "mtd_params"
+ cmd = [
+ sys.executable,
+ "reward_space_analysis.py",
+ "--num_samples",
+ "120",
+ "--seed",
+ str(self.SEED),
+ "--out_dir",
+ str(out_dir),
+ "--params",
+ "max_trade_duration_candles=96",
+ ]
+ result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent)
+ self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
+
+ manifest_path = out_dir / "manifest.json"
+ self.assertTrue(manifest_path.exists(), "Missing manifest.json")
+ with open(manifest_path, "r") as f:
+ manifest = json.load(f)
+
+ # Basic structure checks
+ self.assertIn("reward_params", manifest)
+ self.assertIn("simulation_params", manifest)
+
+ # Reward params should include the tunable (float or int acceptable -> coerce)
+ rp = manifest["reward_params"]
+ self.assertIn("max_trade_duration_candles", rp)
+ self.assertEqual(int(rp["max_trade_duration_candles"]), 96)
+
+ def test_max_trade_duration_candles_propagation_flag(self):
+ """Dynamic flag --max_trade_duration_candles X propagates identically."""
+ out_dir = self.output_path / "mtd_flag"
+ cmd = [
+ sys.executable,
+ "reward_space_analysis.py",
+ "--num_samples",
+ "120",
+ "--seed",
+ str(self.SEED),
+ "--out_dir",
+ str(out_dir),
+ "--max_trade_duration_candles",
+ "64",
+ ]
+ result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent)
+ self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
+
+ manifest_path = out_dir / "manifest.json"
+ self.assertTrue(manifest_path.exists(), "Missing manifest.json")
+ with open(manifest_path, "r") as f:
+ manifest = json.load(f)
+
+ # Basic structure checks
+ self.assertIn("reward_params", manifest)
+ self.assertIn("simulation_params", manifest)
+
+ # Reward params should include the tunable (float or int acceptable -> coerce)
+ rp = manifest["reward_params"]
+ self.assertIn("max_trade_duration_candles", rp)
+ self.assertEqual(int(rp["max_trade_duration_candles"]), 64)
+
if __name__ == "__main__":
# Configure test discovery and execution
"model_reward_parameters": {
"rr": 2,
"profit_aim": 0.025,
- "win_reward_factor": 2
+ "win_reward_factor": 2,
+ "max_trade_duration_candles": 96 // Maximum trade duration in candles
},
"train_cycles": 25,
"add_state_info": true,
"cpu_count": 4,
"max_training_drawdown_pct": 0.02,
- "max_trade_duration_candles": 96, // Maximum trade duration in candles
"n_envs": 8, // Number of DummyVecEnv or SubProcVecEnv training environments
"multiprocessing": true, // Use SubprocVecEnv if n_envs>1 (otherwise DummyVecEnv)
"frame_stacking": 2, // Number of VecFrameStack stacks (set > 1 to use)
...
"rl_config": {
...
- "max_trade_duration_candles": 96, // Maximum trade duration in candles
"n_envs": 1, // Number of DummyVecEnv or SubProcVecEnv training environments
"n_eval_envs": 1, // Number of DummyVecEnv or SubProcVecEnv evaluation environments
"multiprocessing": false, // Use SubprocVecEnv if n_envs>1 (otherwise DummyVecEnv)
super().__init__(*args, **kwargs)
self._set_observation_space()
self.action_masking: bool = self.rl_config.get("action_masking", False)
- self.max_trade_duration_candles: int = self.rl_config.get(
- "max_trade_duration_candles", 128
- )
+
# === INTERNAL STATE ===
self._last_closed_position: Optional[Positions] = None
self._last_closed_trade_tick: int = 0
self._min_unrealized_profit: float = np.inf
self._last_potential: float = 0.0
# === PBRS INSTRUMENTATION ===
- self._total_shaping_reward: float = 0.0
- self._last_shaping_reward: float = 0.0
+ self._last_prev_potential: float = 0.0
+ self._last_next_potential: float = 0.0
+ self._last_reward_shaping: float = 0.0
+ self._total_reward_shaping: float = 0.0
+ self._last_invalid_penalty: float = 0.0
+ self._last_idle_penalty: float = 0.0
+ self._last_hold_penalty: float = 0.0
+ self._last_exit_reward: float = 0.0
+ self._last_entry_additive: float = 0.0
+ self._total_entry_additive: float = 0.0
+ self._last_exit_additive: float = 0.0
+ self._total_exit_additive: float = 0.0
model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
+ self.max_trade_duration_candles: int = int(
+ model_reward_parameters.get(
+ "max_trade_duration_candles",
+ 128,
+ )
+ )
+ self.max_idle_duration_candles: int = int(
+ model_reward_parameters.get(
+ "max_idle_duration_candles",
+ ReforceXY.DEFAULT_IDLE_DURATION_MULTIPLIER
+ * self.max_trade_duration_candles,
+ )
+ )
# === PBRS COMMON PARAMETERS ===
potential_gamma = model_reward_parameters.get("potential_gamma")
if potential_gamma is None:
if self._exit_potential_mode == "canonical":
if self._entry_additive_enabled or self._exit_additive_enabled:
logger.info(
- "Canonical mode: additive rewards disabled with Φ(terminal)=0. PBRS invariance is preserved. "
+ "PBRS canonical mode: additive rewards disabled with Φ(terminal)=0. PBRS invariance is preserved. "
"To use additive rewards, set exit_potential_mode='non_canonical'."
)
self._entry_additive_enabled = False
elif self._exit_potential_mode == "non_canonical":
if self._entry_additive_enabled or self._exit_additive_enabled:
logger.info(
- "Non-canonical mode: additive rewards enabled with Φ(terminal)=0. PBRS invariance is intentionally broken."
+ "PBRS non-canonical mode: additive rewards enabled with Φ(terminal)=0. PBRS invariance is intentionally broken."
)
if MyRLEnv.is_unsupported_pbrs_config(
self._hold_potential_enabled, getattr(self, "add_state_info", False)
):
logger.warning(
- "PBRS: hold_potential_enabled=True & add_state_info=False is unsupported. PBRS invariance is not guaranteed"
+ "PBRS: hold_potential_enabled=True and add_state_info=False is unsupported. Automatically enabling add_state_info=True."
+ )
+ self.add_state_info = True
+
+ # === PNL TARGET VALIDATION ===
+ pnl_target = self.profit_aim * self.rr
+ if MyRLEnv._is_invalid_pnl_target(pnl_target):
+ raise ValueError(
+ f"Invalid pnl_target={pnl_target:.12g} computed from profit_aim={self.profit_aim:.12g} and rr={self.rr:.12g}"
)
+ self._pnl_target = pnl_target
def _get_next_position(self, action: int) -> Positions:
if action == Actions.Long_enter.value and self._position == Positions.Neutral:
Positions.Long,
Positions.Short,
):
- return next_position, 0, 0.0
+ return next_position, 0, pnl
# Exit
if (
self._position in (Positions.Long, Positions.Short)
# Neutral self-loop
return next_position, 0, 0.0
- def _is_invalid_pnl_target(self, pnl_target: float) -> bool:
- """Check if pnl_target is invalid (negative or close to zero)."""
- return pnl_target < 0.0 or np.isclose(pnl_target, 0.0)
+ @staticmethod
+ def _is_invalid_pnl_target(pnl_target: float) -> bool:
+ """Return True when pnl_target is non-finite, <= 0, or effectively zero within tolerance."""
+ return (
+ (not np.isfinite(pnl_target))
+ or (pnl_target <= 0.0)
+ or np.isclose(pnl_target, 0.0)
+ )
def _compute_pnl_duration_signal(
self,
return 0.0
if require_position and position not in (Positions.Long, Positions.Short):
return 0.0
- if self._is_invalid_pnl_target(pnl_target):
- return 0.0
duration_ratio = 0.0 if duration_ratio < 0.0 else duration_ratio
if duration_ratio > 1.0:
if name == "clip":
return max(-1.0, min(1.0, x))
- logger.info("Unknown potential transform '%s'; falling back to tanh", name)
+ logger.warning("Unknown potential transform '%s'; falling back to tanh", name)
return math.tanh(x)
def _compute_exit_potential(self, prev_potential: float, gamma: float) -> float:
if mode == "progressive_release":
decay = self._exit_potential_decay
if not np.isfinite(decay) or decay < 0.0:
- decay = 0.5
+ decay = 0.0
if decay > 1.0:
decay = 1.0
next_potential = prev_potential * (1.0 - decay)
def is_unsupported_pbrs_config(
hold_potential_enabled: bool, add_state_info: bool
) -> bool:
- """Return True if PBRS potential relies on hidden (non-observed) state.
+ """Return True if PBRS potential relies on hidden state.
Case: hold_potential enabled while auxiliary state info (pnl, trade_duration) is excluded
from the observation space (add_state_info=False). In that situation, Φ(s) uses hidden
potential = self._compute_hold_potential(
next_position, next_duration_ratio, next_pnl, pnl_target
)
- shaping_reward = gamma * potential - prev_potential
+ reward_shaping = gamma * potential - prev_potential
self._last_potential = potential
else:
- shaping_reward = 0.0
+ reward_shaping = 0.0
self._last_potential = 0.0
+ self._last_exit_additive = 0.0
+ self._last_entry_additive = 0.0
entry_additive = 0.0
if self._entry_additive_enabled and not self.is_pbrs_invariant_mode():
entry_additive = self._compute_entry_additive(
pnl_target=pnl_target,
duration_ratio=next_duration_ratio,
)
- self._last_shaping_reward = float(shaping_reward)
- self._total_shaping_reward += float(shaping_reward)
- return base_reward + shaping_reward + entry_additive
+ self._last_entry_additive = float(entry_additive)
+ self._total_entry_additive += float(entry_additive)
+ self._last_reward_shaping = float(reward_shaping)
+ self._total_reward_shaping += float(reward_shaping)
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(self._last_potential)
+ return base_reward + reward_shaping + entry_additive
elif is_hold:
if self._hold_potential_enabled:
potential = self._compute_hold_potential(
next_position, next_duration_ratio, next_pnl, pnl_target
)
- shaping_reward = gamma * potential - prev_potential
+ reward_shaping = gamma * potential - prev_potential
self._last_potential = potential
else:
- shaping_reward = 0.0
+ reward_shaping = 0.0
self._last_potential = 0.0
- self._last_shaping_reward = float(shaping_reward)
- self._total_shaping_reward += float(shaping_reward)
- return base_reward + shaping_reward
+ self._last_entry_additive = 0.0
+ self._last_exit_additive = 0.0
+ self._last_reward_shaping = float(reward_shaping)
+ self._total_reward_shaping += float(reward_shaping)
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(self._last_potential)
+ return base_reward + reward_shaping
elif is_exit:
if (
self._exit_potential_mode == "canonical"
or self._exit_potential_mode == "non_canonical"
):
next_potential = 0.0
- exit_shaping_reward = -prev_potential
+ exit_reward_shaping = -prev_potential
else:
next_potential = self._compute_exit_potential(prev_potential, gamma)
- exit_shaping_reward = gamma * next_potential - prev_potential
-
+ exit_reward_shaping = gamma * next_potential - prev_potential
+ self._last_entry_additive = 0.0
+ self._last_exit_additive = 0.0
exit_additive = 0.0
if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
duration_ratio = trade_duration / max(max_trade_duration, 1)
exit_additive = self._compute_exit_additive(
pnl, pnl_target, duration_ratio
)
-
+ self._last_exit_additive = float(exit_additive)
+ self._total_exit_additive += float(exit_additive)
self._last_potential = next_potential
- self._last_shaping_reward = float(exit_shaping_reward)
- self._total_shaping_reward += float(exit_shaping_reward)
- return base_reward + exit_shaping_reward + exit_additive
+ self._last_reward_shaping = float(exit_reward_shaping)
+ self._total_reward_shaping += float(exit_reward_shaping)
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(self._last_potential)
+ return base_reward + exit_reward_shaping + exit_additive
else:
# Neutral self-loop
- self._last_potential = 0.0
- self._last_shaping_reward = 0.0
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(self._last_potential)
+ self._last_entry_additive = 0.0
+ self._last_exit_additive = 0.0
return base_reward
def _set_observation_space(self) -> None:
self._max_unrealized_profit = -np.inf
self._min_unrealized_profit = np.inf
self._last_potential = 0.0
- self._total_shaping_reward = 0.0
- self._last_shaping_reward = 0.0
+ self._last_prev_potential = 0.0
+ self._last_next_potential = 0.0
+ self._last_reward_shaping = 0.0
+ self._total_reward_shaping = 0.0
+ self._last_entry_additive = 0.0
+ self._total_entry_additive = 0.0
+ self._last_exit_additive = 0.0
+ self._total_exit_additive = 0.0
+ self._last_invalid_penalty = 0.0
+ self._last_idle_penalty = 0.0
+ self._last_hold_penalty = 0.0
+ self._last_exit_reward = 0.0
return observation, history
def _get_exit_factor(
model_reward_parameters.get("exit_plateau_grace", 1.0)
)
if exit_plateau_grace < 0.0:
- exit_plateau_grace = 1.0
+ exit_plateau_grace = 0.0
exit_linear_slope = float(model_reward_parameters.get("exit_linear_slope", 1.0))
if exit_linear_slope < 0.0:
- exit_linear_slope = 1.0
+ exit_linear_slope = 0.0
def _legacy(f: float, dr: float, p: Mapping) -> float:
return f * (1.5 if dr <= 1.0 else 0.5)
def _half_life(f: float, dr: float, p: Mapping) -> float:
hl = float(p.get("exit_half_life", 0.5))
if hl <= 0.0:
- hl = 0.5
+ hl = 0.0
return f * math.pow(2.0, -dr / hl)
strategies: Dict[str, Callable[[float, float, Mapping], float]] = {
)
factor = _linear(factor, effective_dr, model_reward_parameters)
- factor *= self._get_pnl_factor(pnl, self.profit_aim * self.rr)
+ factor *= self._get_pnl_factor(pnl, self._pnl_target)
check_invariants = model_reward_parameters.get("check_invariants", True)
check_invariants = (
return factor
def _get_pnl_factor(self, pnl: float, pnl_target: float) -> float:
- if not np.isfinite(pnl) or not np.isfinite(pnl_target):
+ if not np.isfinite(pnl):
return 0.0
model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
base_reward: Optional[float] = None
+ self._last_invalid_penalty = 0.0
+ self._last_idle_penalty = 0.0
+ self._last_hold_penalty = 0.0
+ self._last_exit_reward = 0.0
+
# 1. Invalid action
if not self.action_masking and not self._is_valid(action):
self.tensorboard_log("invalid", category="actions")
base_reward = float(model_reward_parameters.get("invalid_action", -2.0))
+ self._last_invalid_penalty = float(base_reward)
- max_trade_duration = max(self.max_trade_duration_candles, 1)
+ max_trade_duration = max(1, self.max_trade_duration_candles)
trade_duration = self.get_trade_duration()
duration_ratio = trade_duration / max_trade_duration
base_factor = float(model_reward_parameters.get("base_factor", 100.0))
- pnl_target = self.profit_aim * self.rr
- idle_factor = base_factor * pnl_target / 4.0
+ idle_factor = base_factor * self._pnl_target / 4.0
hold_factor = idle_factor
# 2. Idle penalty
and action == Actions.Neutral.value
and self._position == Positions.Neutral
):
- max_idle_duration = int(
- model_reward_parameters.get(
- "max_idle_duration_candles",
- ReforceXY.DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration,
- )
- )
+ max_idle_duration = max(1, self.max_idle_duration_candles)
idle_penalty_scale = float(
model_reward_parameters.get("idle_penalty_scale", 0.5)
)
* idle_penalty_scale
* idle_duration_ratio**idle_penalty_power
)
+ self._last_idle_penalty = float(base_reward)
# 3. Hold overtime penalty
if (
* hold_penalty_scale
* (duration_ratio - 1.0) ** hold_penalty_power
)
+ self._last_hold_penalty = float(base_reward)
# 4. Exit rewards
pnl = self.get_unrealized_profit()
and self._position == Positions.Long
):
base_reward = pnl * self._get_exit_factor(base_factor, pnl, duration_ratio)
+ self._last_exit_reward = float(base_reward)
if (
base_reward is None
and action == Actions.Short_exit.value
and self._position == Positions.Short
):
base_reward = pnl * self._get_exit_factor(base_factor, pnl, duration_ratio)
+ self._last_exit_reward = float(base_reward)
# 5. Default
if base_reward is None:
trade_duration=trade_duration,
max_trade_duration=max_trade_duration,
pnl=pnl,
- pnl_target=pnl_target,
+ pnl_target=self._pnl_target,
)
def _get_observation(self) -> NDArray[np.float32]:
self._update_portfolio_log_returns()
reward = self.calculate_reward(action)
self.total_reward += reward
- self.tensorboard_log(Actions._member_names_[action], category="actions")
trade_type = self.execute_trade(action)
if trade_type is not None:
self.append_trade_history(trade_type, self.current_price(), pre_pnl)
self._update_max_unrealized_profit(pnl)
self._update_min_unrealized_profit(pnl)
delta_pnl = pnl - pre_pnl
+ max_idle_duration = max(1, self.max_idle_duration_candles)
+ idle_duration = self.get_idle_duration()
+ trade_duration = self.get_trade_duration()
info = {
"tick": self._current_tick,
"position": float(self._position.value),
"most_recent_return": round(self.get_most_recent_return(), 5),
"most_recent_profit": round(self.get_most_recent_profit(), 5),
"total_profit": round(self._total_profit, 5),
- "potential": round(self._last_potential, 5),
- "shaping_reward": round(self._last_shaping_reward, 5),
- "total_shaping_reward": round(self._total_shaping_reward, 5),
+ "prev_potential": round(self._last_prev_potential, 5),
+ "next_potential": round(self._last_next_potential, 5),
+ "reward_entry_additive": round(self._last_entry_additive, 5),
+ "reward_exit_additive": round(self._last_exit_additive, 5),
+ "reward_shaping": round(self._last_reward_shaping, 5),
+ "total_reward_shaping": round(self._total_reward_shaping, 5),
+ "reward_invalid": round(self._last_invalid_penalty, 5),
+ "reward_idle": round(self._last_idle_penalty, 5),
+ "reward_hold": round(self._last_hold_penalty, 5),
+ "reward_exit": round(self._last_exit_reward, 5),
"reward": round(reward, 5),
"total_reward": round(self.total_reward, 5),
"pbrs_invariant": self.is_pbrs_invariant_mode(),
- "idle_duration": self.get_idle_duration(),
- "trade_duration": self.get_trade_duration(),
+ "idle_duration": idle_duration,
+ "idle_ratio": (idle_duration / max_idle_duration),
+ "trade_duration": trade_duration,
+ "duration_ratio": (
+ trade_duration / max(1, self.max_trade_duration_candles)
+ ),
"trade_count": int(len(self.trade_history) // 2),
}
self._update_history(info)
if terminated:
# Enforce Φ(terminal)=0 for PBRS invariance (Wiewiora et al. 2003)
self._last_potential = 0.0
+ eps = 1e-6
+ if self.is_pbrs_invariant_mode() and abs(self._total_reward_shaping) > eps:
+ logger.warning(
+ "PBRS mode %s invariance deviation: |sum Δ|=%.6f > eps=%.6f",
+ self._exit_potential_mode,
+ self._total_reward_shaping,
+ eps,
+ )
return (
self._get_observation(),
reward,
max(int(self.max_system_threads / 4), 1),
),
"storage": "file",
- "continuous": True,
- "warm_start": True,
+ "continuous": False,
+ "warm_start": False,
"n_startup_trials": 15,
"n_trials": 50,
"timeout": 7200,
# "hamming",
# "jaccard",
"jensenshannon",
- # "kulczynski1", # deprecated since version 1.15.0
+ # "kulczynski1", # Deprecated in SciPy ≥ 1.15.0; do not use.
"mahalanobis",
# "matching",
"minkowski",
# "rogerstanimoto",
# "russellrao",
"seuclidean",
- # "sokalmichener", # deprecated since version 1.15.0
+ # "sokalmichener", # Deprecated in SciPy ≥ 1.15.0; do not use.
# "sokalsneath",
"sqeuclidean",
# "yule",