From 5ff19be44681c43b8c26750ac569596aead45277 Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Wed, 15 Oct 2025 22:25:22 +0200 Subject: [PATCH] docs(reforcexy): cleanup content MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Jérôme Benoit --- README.md | 1 + ReforceXY/reward_space_analysis/README.md | 462 +++++------------- .../reward_space_analysis.py | 435 +++-------------- ReforceXY/reward_space_analysis/test_cli.py | 6 +- .../test_reward_space_analysis.py | 1 - 5 files changed, 193 insertions(+), 712 deletions(-) diff --git a/README.md b/README.md index 9f40945..3190572 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ - [Configuration tunables](#configuration-tunables) - [ReforceXY](#reforcexy) - [Quick start](#quick-start-1) + - [Supported models](#supported-models) - [Configuration tunables](#configuration-tunables-1) - [Common workflows](#common-workflows) - [Note](#note) diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md index b784031..631796a 100644 --- a/ReforceXY/reward_space_analysis/README.md +++ b/ReforceXY/reward_space_analysis/README.md @@ -1,122 +1,87 @@ -# 📊 Reward Space Analysis - User Guide +# Reward Space Analysis (ReforceXY) -**Analyze and validate ReforceXY reward logic with synthetic data** +Deterministic synthetic sampling with diagnostics for reward shaping, penalties, PBRS invariance. ---- - -## 🎯 What is this? - -This tool helps you understand and validate how the ReforceXY reinforcement learning environment calculates rewards. It generates synthetic trading scenarios to analyze reward behavior across different market conditions. 
+## Key Capabilities -### Key Features - -- ✅ Generate thousands of synthetic trading scenarios deterministically -- ✅ Analyze reward distribution, feature importance & partial dependence -- ✅ Built-in invariant & statistical validation layers (fail-fast) -- ✅ PBRS (Potential-Based Reward Shaping) integration with canonical invariance -- ✅ Export reproducible artifacts (parameter hash + execution manifest) -- ✅ Compare synthetic vs real trading data (distribution shift metrics) -- ✅ Parameter bounds validation & automatic sanitization - ---- - -**New to this tool?** Start with [Common Use Cases](#-common-use-cases) then explore [CLI Parameters](#️-cli-parameters-reference). +- Scalable synthetic scenario generation (reproducible) +- Reward component decomposition & bounds checks +- PBRS modes: canonical, non-canonical, progressive_release, spike_cancel, retain_previous +- Feature importance & optional partial dependence +- Statistical tests (hypothesis, bootstrap CIs, distribution diagnostics) +- Real vs synthetic shift metrics +- Manifest + parameter hash ## Table of contents -- [What is this?](#-what-is-this) -- [Key Features](#key-features) -- [Common Use Cases](#-common-use-cases) +- [Key Capabilities](#key-capabilities) +- [Prerequisites](#prerequisites) +- [Common Use Cases](#common-use-cases) - [1. Validate Reward Logic](#1-validate-reward-logic) - - [2. Analyze Parameter Sensitivity](#2-analyze-parameter-sensitivity) - - [3. Debug Reward Issues](#3-debug-reward-issues) - - [4. Compare Real vs Synthetic Data](#4-compare-real-vs-synthetic-data) -- [Prerequisites](#-prerequisites) - - [System Requirements](#system-requirements) - - [Virtual environment setup](#virtual-environment-setup) -- [CLI Parameters Reference](#️-cli-parameters-reference) + - [2. Parameter Sensitivity](#2-parameter-sensitivity) + - [3. Debug Anomalies](#3-debug-anomalies) + - [4. 
Real vs Synthetic](#4-real-vs-synthetic) +- [CLI Parameters](#cli-parameters) - [Required Parameters](#required-parameters) - - [Core Simulation Parameters](#core-simulation-parameters) + - [Core Simulation](#core-simulation) - [Reward Configuration](#reward-configuration) - - [PnL / Volatility Controls](#pnl--volatility-controls) + - [PnL / Volatility](#pnl--volatility) - [Trading Environment](#trading-environment) - - [Output Configuration](#output-configuration) - - [Reproducibility Model](#reproducibility-model) - - [Direct Tunable Overrides vs `--params`](#direct-tunable-overrides-vs---params) -- [Example Commands](#-example-commands) -- [Understanding Results](#-understanding-results) - - [Main Report](#main-report) + - [Output & Overrides](#output--overrides) + - [Parameter Cheat Sheet](#parameter-cheat-sheet) + - [Exit Attenuation Kernels](#exit-attenuation-kernels) + - [Transform Functions](#transform-functions) + - [Skipping Feature Analysis](#skipping-feature-analysis) + - [Reproducibility](#reproducibility) + - [Overrides vs `--params`](#overrides-vs---params) +- [Examples](#examples) +- [Outputs](#outputs) + - [Main Report](#main-report-statistical_analysismd) - [Data Exports](#data-exports) - - [Manifest Structure (`manifest.json`)](#manifest-structure-manifestjson) - - [Distribution Shift Metric Conventions](#distribution-shift-metric-conventions) -- [Advanced Usage](#-advanced-usage) + - [Manifest](#manifest-manifestjson) + - [Distribution Shift Metrics](#distribution-shift-metrics) +- [Advanced Usage](#advanced-usage) - [Custom Parameter Testing](#custom-parameter-testing) - [Real Data Comparison](#real-data-comparison) - [Batch Analysis](#batch-analysis) -- [Validation & Testing](#-validation--testing) +- [Testing](#testing) - [Run Tests](#run-tests) - - [Code Coverage Analysis](#code-coverage-analysis) + - [Coverage](#coverage) - [When to Run Tests](#when-to-run-tests) - - [Run Specific Test Categories](#run-specific-test-categories) -- 
[Troubleshooting](#-troubleshooting) - - [No Output Files Generated](#no-output-files-generated) + - [Focused Test Sets](#focused-test-sets) +- [Troubleshooting](#troubleshooting) + - [No Output Files](#no-output-files) - [Unexpected Reward Values](#unexpected-reward-values) - [Slow Execution](#slow-execution) - [Memory Errors](#memory-errors) -## 📦 Prerequisites - -### System Requirements - -- Python 3.8+ -- 4GB RAM minimum (8GB recommended for large analyses) -- No GPU required +## Prerequisites -### Virtual environment setup - -Keep the tooling self-contained by creating a virtual environment directly inside `ReforceXY/reward_space_analysis` and installing packages against it: +Requirements: Python 3.8+, ≥4GB RAM (CPU only). Recommended venv: ```shell -# From the repository root cd ReforceXY/reward_space_analysis python -m venv .venv source .venv/bin/activate -pip install pandas numpy scipy scikit-learn +pip install pandas numpy scipy scikit-learn pytest ``` -Whenever you need to run analyses, activate the environment first and execute: - +Run: ```shell -source .venv/bin/activate -python reward_space_analysis.py --num_samples 20000 --output reward_space_outputs +python reward_space_analysis.py --num_samples 20000 --output out ``` -> Deactivate the environment with `deactivate` when you're done. - -Unless otherwise noted, the command examples below assume your current working directory is `ReforceXY/reward_space_analysis` and the virtual environment is activated. - ---- - -## 💡 Common Use Cases +## Common Use Cases ### 1. Validate Reward Logic -**Goal:** Ensure rewards behave as expected in different scenarios - ```shell python reward_space_analysis.py --num_samples 20000 --output reward_space_outputs ``` -**Check in `statistical_analysis.md`:** +See `statistical_analysis.md` (1–3): positive exit averages (long & short), negative invalid penalties, monotonic idle reduction, zero invariance failures. 
-- Long/Short exits should have positive average rewards -- Invalid actions should have negative penalties (default: -2.0) -- Idle periods should reduce rewards progressively -- Validation layers report any invariant violations - -### 2. Analyze Parameter Sensitivity - -**Goal:** See how reward parameters affect trading behavior +### 2. Parameter Sensitivity ```shell # Test different win reward factors @@ -137,11 +102,9 @@ python reward_space_analysis.py \ --output pbrs_analysis ``` -**Compare:** Reward distributions between runs in `statistical_analysis.md` - -### 3. Debug Reward Issues +Compare reward distribution & component share deltas across runs. -**Goal:** Identify why your RL agent behaves unexpectedly +### 3. Debug Anomalies ```shell # Generate detailed analysis @@ -150,16 +113,9 @@ python reward_space_analysis.py \ --output debug_analysis ``` -**Look at:** - -- `statistical_analysis.md` - Comprehensive report with: - - Feature importance and model diagnostics - - Statistical significance of relationships - - Hypothesis tests and confidence intervals - -### 4. Compare Real vs Synthetic Data +Focus: feature importance, shaping activation, invariance drift, extremes. -**Goal:** Validate synthetic analysis against real trading +### 4. Real vs Synthetic ```shell # First, collect real episodes (see Advanced Usage section) @@ -172,117 +128,79 @@ python reward_space_analysis.py \ --- -## ⚙️ CLI Parameters Reference +## CLI Parameters ### Required Parameters -None - all parameters have sensible defaults. +None (all have defaults). -### Core Simulation Parameters +### Core Simulation -**`--num_samples`** (int, default: 20000) +**`--num_samples`** (int, default: 20000) – Synthetic scenarios. More = better stats (slower). Recommended: 10k (quick), 50k (standard), 100k+ (deep). 
-- Number of synthetic trading scenarios to generate -- More samples = more accurate statistics but slower analysis -- Recommended: 10,000 (quick test), 50,000 (standard), 100,000+ (detailed) +**`--seed`** (int, default: 42) – Master seed (reuse for identical runs). -**`--seed`** (int, default: 42) - -- Random seed for reproducibility -- Use same seed to get identical results across runs - -**`--max_trade_duration`** (int, default: 128) - -- Maximum trade duration in candles (from environment config) -- Should match your actual trading environment setting -- Drives idle grace: when `max_idle_duration_candles` fallback = `2 * max_trade_duration` +**`--max_trade_duration`** (int, default: 128) – Max trade duration (candles). Idle grace fallback: `max_idle_duration_candles = 4 * max_trade_duration`. ### Reward Configuration -**`--base_factor`** (float, default: 100.0) - -- Base reward scaling factor (from environment config) -- Should match your environment's base_factor +**`--base_factor`** (float, default: 100.0) – Base reward scale (match environment). -**`--profit_target`** (float, default: 0.03) +**`--profit_target`** (float, default: 0.03) – Target profit (e.g. 0.03=3%) for exit reward. -- Target profit threshold as decimal (e.g., 0.03 = 3%) -- Used for exit reward +**`--risk_reward_ratio`** (float, default: 1.0) – Adjusts effective profit target. -**`--risk_reward_ratio`** (float, default: 1.0) +**`--max_duration_ratio`** (float, default: 2.5) – Upper multiple for sampled trade/idle durations (higher = more variety). -- Risk/reward ratio multiplier -- Affects profit target adjustment in reward calculations +### PnL / Volatility -**`--max_duration_ratio`** (float, default: 2.5) +Controls synthetic PnL variance (heteroscedastic; grows with duration): -- Multiple of max_trade_duration used for sampling trade/idle durations -- Higher = more variety in duration scenarios +**`--pnl_base_std`** (float, default: 0.02) – Volatility floor. 
-### PnL / Volatility Controls - -These parameters shape the synthetic PnL generation process and heteroscedasticity (variance increasing with duration): - -**`--pnl_base_std`** (float, default: 0.02) -- Base standard deviation (volatility floor) for generated PnL before duration scaling. - -**`--pnl_duration_vol_scale`** (float, default: 0.5) -- Multiplicative scaling of additional volatility proportional to (trade_duration / max_trade_duration). -- Higher values = stronger heteroscedastic effect. +**`--pnl_duration_vol_scale`** (float, default: 0.5) – Extra volatility × (duration/max_trade_duration). Higher ⇒ stronger. ### Trading Environment -**`--trading_mode`** (choice: spot|margin|futures, default: spot) - -- **spot**: Disables short selling -- **margin**: Enables short positions -- **futures**: Enables short positions - -**`--action_masking`** (choice: true|false|1|0|yes|no, default: true) - -- Enable/disable action masking simulation -- Should match your environment configuration - -### Output Configuration - -**`--output`** (path, default: reward_space_outputs) +**`--trading_mode`** (spot|margin|futures, default: spot) – spot: no shorts; margin/futures: shorts enabled. -- Output directory for all generated files -- Will be created if it doesn't exist +**`--action_masking`** (bool, default: true) – Simulate action masking (match environment). -**`--params`** (key=value pairs) +### Output & Overrides -- Override any reward parameter from DEFAULT_MODEL_REWARD_PARAMETERS -- Format: `--params key1=value1 key2=value2` -- Example: `--params win_reward_factor=3.0 idle_penalty_scale=2.0` +**`--output`** (path, default: reward_space_outputs) – Output directory (auto-created). -**All tunable parameters (override with --params):** +**`--params`** (k=v ...) – Override reward params. Example: `--params win_reward_factor=3.0 idle_penalty_scale=2.0`. -_Invalid action penalty:_ +All tunables mirror `DEFAULT_MODEL_REWARD_PARAMETERS`. Flags or `--params` (wins on conflicts). 
-- `invalid_action` (default: -2.0) - Penalty for invalid actions +### Parameter Cheat Sheet -_Idle penalty configuration:_ +Core frequently tuned parameters: -- `idle_penalty_scale` (default: 0.5) - Scale of idle penalty -- `idle_penalty_power` (default: 1.025) - Power applied to idle penalty scaling +| Parameter | Default | Description | +|-----------|---------|-------------| +| `win_reward_factor` | 2.0 | Profit overshoot multiplier | +| `pnl_factor_beta` | 0.5 | PnL amplification beta | +| `idle_penalty_scale` | 0.5 | Idle penalty scale | +| `idle_penalty_power` | 1.025 | Idle penalty exponent (>1 slightly convex) | +| `max_idle_duration_candles` | None | Idle duration cap; fallback 4× max trade duration | +| `hold_penalty_scale` | 0.25 | Hold penalty scale | +| `hold_penalty_power` | 1.025 | Hold penalty exponent | +| `exit_attenuation_mode` | linear | Exit attenuation kernel | +| `exit_plateau` | true | Flat region before attenuation starts | +| `exit_plateau_grace` | 1.0 | Plateau duration ratio grace | +| `exit_linear_slope` | 1.0 | Linear kernel slope | +| `exit_power_tau` | 0.5 | Tau controlling power kernel decay (0,1] | +| `exit_half_life` | 0.5 | Half-life for half_life kernel | +| `potential_gamma` | 0.9 | PBRS discount γ | +| `exit_potential_mode` | canonical | Exit potential mode | +| `efficiency_weight` | 1.0 | Efficiency contribution weight | +| `efficiency_center` | 0.5 | Efficiency pivot in [0,1] | -_Hold penalty configuration:_ +For full list & exact defaults see `reward_space_analysis.py` (`DEFAULT_MODEL_REWARD_PARAMETERS`). -- `hold_penalty_scale` (default: 0.25) - Scale of hold penalty -- `hold_penalty_power` (default: 1.025) - Power applied to hold penalty scaling - -_Exit attenuation configuration:_ - -- `exit_attenuation_mode` (default: linear) - Selects attenuation kernel (see table below: legacy|sqrt|linear|power|half_life). Fallback to linear. 
-- `exit_plateau` (default: true) - Enables plateau (no attenuation until `exit_plateau_grace`). -- `exit_plateau_grace` (default: 1.0) - Duration ratio boundary of full-strength region (may exceed 1.0). -- `exit_linear_slope` (default: 1.0) - Slope parameter used only when mode = linear. -- `exit_power_tau` (default: 0.5) - Tau ∈ (0,1]; internally mapped to alpha (see kernel table). -- `exit_half_life` (default: 0.5) - Half-life parameter for the half_life kernel. -- `exit_factor_threshold` (default: 10000.0) - Warning-only soft threshold (emits RuntimeWarning; no capping). - -**Attenuation kernels**: +### Exit Attenuation Kernels Let r be the raw duration ratio and grace = `exit_plateau_grace`. @@ -302,38 +220,7 @@ effective_r = r if not exit_plateau Where r* = `effective_r` above. -_Efficiency configuration:_ - -- `efficiency_weight` (default: 1.0) - Weight for efficiency factor in exit reward -- `efficiency_center` (default: 0.5) - Linear pivot in [0,1] for efficiency ratio. If efficiency_ratio > center ⇒ amplification (>1); if < center ⇒ attenuation (<1, floored at 0). - -_Profit factor configuration:_ - -- `win_reward_factor` (default: 2.0) - Asymptotic bonus multiplier for PnL above target. Raw `profit_target_factor` ∈ [1, 1 + win_reward_factor] (tanh bounds it); overall amplification may exceed this once multiplied by `efficiency_factor`. 
-- `pnl_factor_beta` (default: 0.5) - Sensitivity of amplification around target - -_PBRS (Potential-Based Reward Shaping) configuration:_ - -- `potential_gamma` (default: 0.95) - Discount factor γ for PBRS potential term (0 ≤ γ ≤ 1) -- `exit_potential_mode` (default: canonical) - Exit potential mode: 'canonical' (Φ=0, preserves invariance, disables additives), 'non-canonical' (Φ=0, allows additives, breaks invariance), 'progressive_release', 'spike_cancel', 'retain_previous' -- `exit_potential_decay` (default: 0.5) - Decay factor for progressive_release exit mode (0 ≤ decay ≤ 1) -- `hold_potential_enabled` (default: true) - Enable PBRS hold potential function Φ(s) -- `hold_potential_scale` (default: 1.0) - Scale factor for hold potential function -- `hold_potential_gain` (default: 1.0) - Gain factor applied before transforms in hold potential -- `hold_potential_transform_pnl` (default: tanh) - Transform function for PnL: tanh, softsign, arctan, sigmoid, asinh, clip -- `hold_potential_transform_duration` (default: tanh) - Transform function for duration ratio -- `entry_additive_enabled` (default: false) - Enable entry additive reward (non-PBRS component) -- `entry_additive_scale` (default: 1.0) - Scale factor for entry additive reward -- `entry_additive_gain` (default: 1.0) - Gain factor for entry additive reward -- `entry_additive_transform_pnl` (default: tanh) - Transform function for PnL in entry additive -- `entry_additive_transform_duration` (default: tanh) - Transform function for duration ratio in entry additive -- `exit_additive_enabled` (default: false) - Enable exit additive reward (non-PBRS component) -- `exit_additive_scale` (default: 1.0) - Scale factor for exit additive reward -- `exit_additive_gain` (default: 1.0) - Gain factor for exit additive reward -- `exit_additive_transform_pnl` (default: tanh) - Transform function for PnL in exit additive -- `exit_additive_transform_duration` (default: tanh) - Transform function for duration ratio in exit 
additive - -**Transform functions**: +### Transform Functions | Transform | Formula | Range | Characteristics | Use Case | |-----------|---------|-------|-----------------|----------| @@ -344,73 +231,26 @@ _PBRS (Potential-Based Reward Shaping) configuration:_ | `asinh` | x / sqrt(1 + x^2) | (-1, 1) | Normalized asinh-like transform | Extreme outlier robustness | | `clip` | clip(x, -1, 1) | [-1, 1] | Hard clipping at ±1 | Preserve linearity within bounds | -_Invariant / safety controls:_ - -- `check_invariants` (default: true) - Enable/disable runtime invariant & safety validations (simulation invariants, mathematical bounds, distribution checks). Set to `false` only for performance experiments; not recommended for production validation. - -**`--real_episodes`** (path, optional) - -- Path to real episode rewards pickle file for distribution comparison -- Enables distribution shift analysis (KL(synthetic‖real), JS distance, Wasserstein distance, KS test) -- Example: `path/to/episode_rewards.pkl` - -**`--pvalue_adjust`** (choice: none|benjamini_hochberg, default: none) - -- Apply multiple hypothesis testing correction (Benjamini–Hochberg) to p-values in statistical hypothesis tests. -- When set to `benjamini_hochberg`, adjusted p-values and adjusted significance flags are added to the reports. - -**`--stats_seed`** (int, optional; default: inherit `--seed`) - -- Dedicated seed for statistical analyses (hypothesis tests, bootstrap confidence intervals, distribution diagnostics). -- Use this if you want to generate multiple independent statistical analyses over the same synthetic dataset without re-simulating samples. -- If omitted, falls back to `--seed` for full run determinism. - -**`--strict_diagnostics`** (flag, default: disabled) - -Fail-fast switch controlling handling of degenerate statistical situations: +Invariant toggle: disable only for performance experiments (diagnostics become advisory). 
-| Condition | Graceful (default) | Strict (`--strict_diagnostics`) | -|-----------|--------------------|---------------------------------| -| Zero-width bootstrap CI | Widen by epsilon (~1e-9) + warning | Abort (AssertionError) | -| NaN skewness/kurtosis (constant distribution) | Replace with 0.0 + warning | Abort | -| NaN Anderson statistic (constant distribution) | Replace with 0.0 + warning | Abort | -| NaN Q-Q R² (constant distribution) | Replace with 1.0 + warning | Abort | +### Skipping Feature Analysis -Use strict mode in CI or research contexts requiring hard guarantees; keep default for exploratory analysis to avoid aborting entire runs on trivial constants. +**`--skip_partial_dependence`**: skip PD curves (faster). -**`--bootstrap_resamples`** (int, default: 10000) - -- Number of bootstrap resamples used for confidence intervals (percentile method). -- Lower values (< 500) yield coarse intervals; a warning (RewardDiagnosticsWarning) is emitted if below internal recommended minimum (currently 200) to help with very fast exploratory runs. -- Increase for more stable interval endpoints (typical: 5000–20000). Runtime scales roughly linearly. - -**`--skip_partial_dependence`** (flag, default: disabled) - -- When set, skips computation and export of partial dependence CSV files, reducing runtime (often 30–60% faster for large sample sizes) at the cost of losing marginal response curve inspection. -- Feature importance (RandomForest Gini importance + permutation importance) is still computed. - -**`--skip-feature-analysis`** (flag, default: disabled) - -- Skips the entire model-based feature analysis block: no RandomForest training, no permutation importance, no feature_importance.csv, no partial_dependence_*.csv (regardless of `--skip_partial_dependence`). -- Automatically suppresses any partial dependence computation even if `--skip_partial_dependence` is not provided (hard superset). 
-- Useful for ultra-fast smoke / CI runs or very low sample exploratory checks (e.g. `--num_samples < 4`) where the model would not be statistically meaningful.
+**`--skip-feature-analysis`**: skip model, importance, PD.

Hierarchy / precedence of skip flags:

| Scenario | `--skip-feature-analysis` | `--skip_partial_dependence` | Feature Importance | Partial Dependence | Report Section 4 |
|----------|---------------------------|-----------------------------|--------------------|-------------------|------------------|
| Default (no flags) | ✗ | ✗ | Yes | Yes | Full (R², top features, exported data) |
| PD only skipped | ✗ | ✓ | Yes | No | Full (PD line shows skipped note) |
| Feature analysis skipped | ✓ | ✗ | No | No | Marked “(skipped)” with reason(s) |
| Both flags | ✓ | ✓ | No | No | Marked “(skipped)” + note PD redundant |

-Additional notes:
+Auto-skip if `num_samples < 4`.

-- If `--num_samples < 4`, feature analysis is automatically skipped (insufficient rows to perform train/test split) and the summary marks the section as skipped with reason.
-- Providing `--skip_partial_dependence` together with `--skip-feature-analysis` is harmless; the report clarifies redundancy.
-- Skipping feature analysis reduces runtime and memory footprint significantly for large `--num_samples` (avoid building a 400-tree forest + permutation loops).
-
-### Reproducibility Model
+### Reproducibility

| Component | Controlled By | Notes |
|-----------|---------------|-------|
| Synthetic sampling | `--seed` | Controls trajectory & stochastic components. |
| Statistical tests / bootstrap | `--stats_seed` (fallback `--seed`) | Isolated deterministic stream. |
| RandomForest & permutation importance | `--seed` | Ensures identical splits and tree construction. |
| Partial dependence grids | Deterministic | Depends only on fitted model & data. 
| -Common patterns: +Patterns: ```shell # Same synthetic data, two different statistical re-analysis runs python reward_space_analysis.py --num_samples 50000 --seed 123 --stats_seed 9001 --output run_stats1 @@ -429,9 +269,9 @@ python reward_space_analysis.py --num_samples 50000 --seed 123 --stats_seed 9002 python reward_space_analysis.py --num_samples 50000 --seed 777 ``` -### Direct Tunable Overrides vs `--params` +### Overrides vs `--params` -All reward parameters are also available as individual CLI flags. You may choose either style: +Reward parameters also have individual flags: ```shell # Direct flag style @@ -441,9 +281,9 @@ python reward_space_analysis.py --win_reward_factor 3.0 --idle_penalty_scale 2.0 python reward_space_analysis.py --params win_reward_factor=3.0 idle_penalty_scale=2.0 --num_samples 15000 ``` -If both a direct flag and the same key in `--params` are provided, the `--params` value takes highest precedence. +`--params` wins on conflicts. -## 📝 Example Commands +## Examples ```shell # Quick test with defaults @@ -477,28 +317,11 @@ python reward_space_analysis.py \ --- -## 📊 Understanding Results - -The analysis generates the following output files: - -### Main Report - -**`statistical_analysis.md`** - Comprehensive statistical analysis containing: +## Outputs -1. **Global Statistics** - Reward distribution, per-action stats, component activation & ranges. -2. **Sample Representativity** - Position/action distributions, critical regime coverage, component activation recap. -3. **Reward Component Analysis** - Binned relationships (idle, hold, exit), correlation matrix (constant features removed), PBRS analysis (activation rates, component stats, invariance summary). -4. **Feature Importance** - Random Forest importance + partial dependence. -5. **Statistical Validation** - Hypothesis tests, bootstrap confidence intervals, normality diagnostics, optional distribution shift (5.4) when real episodes provided. 
+### Main Report (`statistical_analysis.md`) -**Summary** - 7-point concise synthesis: -1. Reward distribution health (center, spread, tail asymmetry) -2. Action & position coverage (usage %, invalid rate, masking efficacy) -3. Component contributions (activation rates + mean / |mean| ranking) -4. Exit attenuation behavior (mode, continuity, effective decay characteristics) -5. Feature signal quality (model R², leading predictors, stability notes) -6. Statistical outcomes (significant correlations / tests, any multiple-testing adjustment applied, distribution shift if real data) -7. PBRS invariance verdict (|Σ shaping| < 1e-6 => canonical; otherwise non-canonical with absolute deviation) +Includes: global stats, representativity, component + PBRS analysis, feature importance/PD, statistical validation (tests, CIs, diagnostics), optional shift metrics, summary. ### Data Exports @@ -509,7 +332,7 @@ The analysis generates the following output files: | `partial_dependence_*.csv` | Partial dependence data for key features | | `manifest.json` | Runtime manifest (simulation + reward params + hash) | -### Manifest Structure (`manifest.json`) +### Manifest (`manifest.json`) | Field | Type | Description | |-------|------|-------------| @@ -524,9 +347,9 @@ The analysis generates the following output files: | `simulation_params` | object | All simulation inputs (num_samples, seed, volatility knobs, etc.). | | `params_hash` | string (sha256) | Hash over ALL `simulation_params` + ALL `reward_params` (lexicographically ordered). | -Reproducibility: two runs are input-identical iff their `params_hash` values match. Because defaults are included in the hash, modifying a default value (even if not overridden) changes the hash. +Two runs match iff `params_hash` identical (defaults included in hash scope). 
-### Distribution Shift Metric Conventions +### Distribution Shift Metrics | Metric | Definition | Notes | |--------|------------|-------| @@ -536,19 +359,15 @@ Reproducibility: two runs are input-identical iff their `params_hash` values mat | `*_ks_statistic` | KS two-sample statistic | ∈ [0,1]; higher = greater divergence. | | `*_ks_pvalue` | KS test p-value | ∈ [0,1]; small ⇒ reject equality (at α). | -Implementation details: -- Histograms: 50 uniform bins spanning min/max across both samples. -- Probabilities: counts + ε (1e-10) then normalized ⇒ avoids log(0) and division by zero. -- Degenerate distributions short-circuit to zeros / p-value 1.0. -- JS distance instead of raw JS divergence for bounded interpretability and smooth interpolation. +Implementation: 50-bin hist; add ε=1e-10 before normalizing; constants ⇒ zero divergence, KS p=1.0. --- -## 🔬 Advanced Usage +## Advanced Usage ### Custom Parameter Testing -Test different reward parameter configurations to understand their impact: +Test reward parameter configurations: ```shell # Test power-based exit attenuation with custom tau @@ -583,7 +402,7 @@ python reward_space_analysis.py \ ### Real Data Comparison -For production validation, compare synthetic analysis with real trading episodes: +Compare with real trading episodes: ```shell python reward_space_analysis.py \ @@ -592,7 +411,7 @@ python reward_space_analysis.py \ --output real_vs_synthetic ``` -The report will include distribution shift metrics (KL divergence ≥ 0, JS distance ∈ [0,1], Wasserstein ≥ 0, KS statistic ∈ [0,1], KS p-value ∈ [0,1]) showing how well synthetic samples represent real trading. Degenerate (constant) distributions are auto-detected and produce zero divergence and KS p-value = 1.0 to avoid spurious instability. +Shift metrics: lower is better (except p-value: higher ⇒ cannot reject equality). 
### Batch Analysis @@ -608,7 +427,7 @@ done --- -## 🧪 Validation & Testing +## Testing ### Run Tests @@ -619,7 +438,7 @@ pip install pytest packaging pytest -q ``` -### Code Coverage Analysis +### Coverage ```shell pip install pytest-cov @@ -635,7 +454,7 @@ pytest -q --cov=. --cov-report=html # open htmlcov/index.html - After updating dependencies or Python version - When contributing new features (aim for >80% coverage on new code) -### Run Specific Test Categories +### Focused Test Sets ```shell pytest -q test_reward_space_analysis.py::TestIntegration @@ -646,52 +465,21 @@ pytest -q test_reward_space_analysis.py::TestRewardAlignment --- -## 🆘 Troubleshooting +## Troubleshooting -### No Output Files Generated +### No Output Files -**Symptom:** Script completes but no files in output directory - -**Solution:** - -- Check write permissions in output directory -- Ensure sufficient disk space (min 100MB free) -- Verify Python path is correct +Check permissions, disk space, working directory. ### Unexpected Reward Values -**Symptom:** Rewards don't match expected behavior - -**Solution:** - -- Run `test_reward_space_analysis.py` to validate logic -- Review parameter overrides with `--params` -- Check trading mode settings (spot vs margin/futures) -- Verify `base_factor` matches your environment config -- Check PBRS settings: `hold_potential_enabled`, `exit_potential_mode`, and transform functions -- Review parameter adjustments in output logs for any automatic bound clamping +Run tests; inspect overrides; confirm trading mode, PBRS settings, clamps. ### Slow Execution -**Symptom:** Analysis takes excessive time to complete - -**Solution:** - -- Reduce `--num_samples` (start with 10,000) -- Use `--trading_mode spot` (fewer action combinations) -- Close other memory-intensive applications -- Use SSD storage for faster I/O -- Use `--skip_partial_dependence` to skip marginal response curves -- Temporarily lower `--bootstrap_resamples` (e.g. 
1000) during iteration (expect wider CIs) +Lower samples; skip PD/feature analysis; reduce resamples; ensure SSD. ### Memory Errors -**Symptom:** `MemoryError` or system freeze - -**Solution:** - -- Reduce sample size to 10,000-20,000 -- Use 64-bit Python installation -- Add more RAM or configure swap file -- Process data in batches for custom analyses +Reduce samples; ensure 64‑bit Python; batch processing; add RAM/swap. diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py index b15e5ba..42b344b 100644 --- a/ReforceXY/reward_space_analysis/reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/reward_space_analysis.py @@ -1,41 +1,12 @@ #!/usr/bin/env python3 -"""Synthetic reward space analysis for the ReforceXY environment. - -Capabilities: -- Hypothesis testing (Spearman, Kruskal-Wallis, Mann-Whitney). -- Percentile bootstrap confidence intervals (BCa not yet implemented). -- Distribution diagnostics (Shapiro, Anderson, skewness, kurtosis, Q-Q R²). -- Distribution shift metrics (KL divergence, JS distance, Wasserstein, KS test) with - degenerate (constant) distribution safeguards. -- Unified RandomForest feature importance + partial dependence. -- Heteroscedastic PnL simulation (variance scales with duration). - -Exit attenuation mode normalization: -- User supplied ``exit_attenuation_mode`` is taken as-is (case-sensitive) and validated - against the allowed set. Any invalid value (including casing mismatch) results in a - silent fallback to ``'linear'``. - -Architecture principles: -- Single source of truth: ``DEFAULT_MODEL_REWARD_PARAMETERS`` (dynamic CLI generation). -- Determinism: explicit seeding, parameter hashing for manifest traceability. -- Extensibility: modular helpers (sampling, reward calculation, statistics, reporting). -""" +"""Synthetic reward space analysis utilities for ReforceXY. 
-# --------------------------------------------------------------------------- -# Module Layout -# --------------------------------------------------------------------------- -# Actual order in this module (kept for conceptual proximity): -# 1. Imports & global constants -# 2. Enums & type aliases -# 3. Default parameter definitions & validation utilities -# 4. Core generic helpers (parsing, coercion, safety) -# 5. Dataclasses (RewardContext, RewardBreakdown) -# 6. Reward computation primitives (penalties, exit factor, pnl factor, calculate_reward) -# 7. Simulation utilities (sampling, invariants) -# 8. Statistical / analytical helpers (summary stats, binned stats, tests, diagnostics) -# 9. PBRS transforms, helpers & implementation (potential shaping logic) -# 10. CLI construction & reporting (argument parser, report writer, report generator) -# 11. main() entry point +Features: +* Sample generation + reward computation (incl. PBRS). +* Statistical tests, bootstrap CIs, distribution & shift metrics. +* Feature importance + optional partial dependence. +* CLI producing report + manifest (hashed parameters for reproducibility). +""" from __future__ import annotations @@ -168,44 +139,44 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = { } DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = { - "invalid_action": "Penalty for invalid actions.", - "base_factor": "Base reward factor used inside the environment.", - "idle_penalty_power": "Power applied to idle penalty scaling.", - "idle_penalty_scale": "Scale of idle penalty.", - "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling.", - "hold_penalty_scale": "Scale of hold penalty.", - "hold_penalty_power": "Power applied to hold penalty scaling.", - "exit_attenuation_mode": "Attenuation kernel (legacy|sqrt|linear|power|half_life).", - "exit_plateau": "Enable plateau. 
If true, full strength until grace boundary then apply attenuation.", - "exit_plateau_grace": "Grace boundary duration ratio for plateau (full strength until this boundary).", - "exit_linear_slope": "Slope for linear exit attenuation.", - "exit_power_tau": "Tau in (0,1] to derive alpha for power mode.", - "exit_half_life": "Half-life for exponential decay exit mode.", - "efficiency_weight": "Weight for efficiency factor in exit reward.", - "efficiency_center": "Pivot (in [0,1]) for linear efficiency factor; efficiency_ratio above this increases factor, below decreases.", - "win_reward_factor": "Asymptotic bonus multiplier for pnl above target: approaches (1 + win_reward_factor); combined with efficiency_factor the final product can exceed this bound.", - "pnl_factor_beta": "Sensitivity of amplification around target.", - "check_invariants": "Boolean flag (true/false) to enable runtime invariant & safety checks.", - "exit_factor_threshold": "If |exit factor| exceeds this threshold, emit warning.", + "invalid_action": "Penalty for invalid actions", + "base_factor": "Base reward scale", + "idle_penalty_power": "Idle penalty exponent", + "idle_penalty_scale": "Idle penalty scale", + "max_idle_duration_candles": "Idle duration cap (candles)", + "hold_penalty_scale": "Hold penalty scale", + "hold_penalty_power": "Hold penalty exponent", + "exit_attenuation_mode": "Exit kernel (legacy|sqrt|linear|power|half_life)", + "exit_plateau": "Use plateau before attenuation", + "exit_plateau_grace": "Plateau grace duration ratio", + "exit_linear_slope": "Linear kernel slope", + "exit_power_tau": "Tau for power kernel (0,1]", + "exit_half_life": "Half-life for exp kernel", + "efficiency_weight": "Efficiency weight", + "efficiency_center": "Efficiency pivot in [0,1]", + "win_reward_factor": "Profit overshoot bonus factor", + "pnl_factor_beta": "PnL amplification sensitivity", + "check_invariants": "Enable runtime invariant checks", + "exit_factor_threshold": "Warn if |exit_factor| 
exceeds", # PBRS parameters - "potential_gamma": "Discount factor γ for PBRS potential-based reward shaping (0 ≤ γ ≤ 1).", - "exit_potential_mode": "Exit potential mode: 'canonical' (Φ=0 & additives disabled), 'non-canonical' (Φ=0 & additives allowed), 'progressive_release', 'spike_cancel', 'retain_previous'.", - "exit_potential_decay": "Decay factor for progressive_release exit mode (0 ≤ decay ≤ 1).", - "hold_potential_enabled": "Enable PBRS hold potential function Φ(s).", - "hold_potential_scale": "Scale factor for hold potential function.", - "hold_potential_gain": "Gain factor applied before transforms in hold potential.", - "hold_potential_transform_pnl": "Transform function for PnL in hold potential: tanh, softsign, arctan, sigmoid, asinh, clip.", - "hold_potential_transform_duration": "Transform function for duration ratio in hold potential.", - "entry_additive_enabled": "Enable entry additive reward (non-PBRS component).", - "entry_additive_scale": "Scale factor for entry additive reward.", - "entry_additive_gain": "Gain factor for entry additive reward.", - "entry_additive_transform_pnl": "Transform function for PnL in entry additive (tanh, softsign, arctan, sigmoid, asinh, clip).", - "entry_additive_transform_duration": "Transform function for duration ratio in entry additive.", - "exit_additive_enabled": "Enable exit additive reward (non-PBRS component).", - "exit_additive_scale": "Scale factor for exit additive reward.", - "exit_additive_gain": "Gain factor for exit additive reward.", - "exit_additive_transform_pnl": "Transform function for PnL in exit additive (tanh, softsign, arctan, sigmoid, asinh, clip).", - "exit_additive_transform_duration": "Transform function for duration ratio in exit additive.", + "potential_gamma": "PBRS discount γ (0–1)", + "exit_potential_mode": "Exit potential mode (canonical|non-canonical|progressive_release|spike_cancel|retain_previous)", + "exit_potential_decay": "Decay for progressive_release (0–1)", + 
"hold_potential_enabled": "Enable hold potential Φ", + "hold_potential_scale": "Hold potential scale", + "hold_potential_gain": "Hold potential gain", + "hold_potential_transform_pnl": "Hold PnL transform", + "hold_potential_transform_duration": "Hold duration transform", + "entry_additive_enabled": "Enable entry additive", + "entry_additive_scale": "Entry additive scale", + "entry_additive_gain": "Entry additive gain", + "entry_additive_transform_pnl": "Entry PnL transform", + "entry_additive_transform_duration": "Entry duration transform", + "exit_additive_enabled": "Enable exit additive", + "exit_additive_scale": "Exit additive scale", + "exit_additive_gain": "Exit additive gain", + "exit_additive_transform_pnl": "Exit PnL transform", + "exit_additive_transform_duration": "Exit duration transform", } @@ -377,7 +348,7 @@ def _is_short_allowed(trading_mode: str) -> bool: # Internal safe fallback helper for numeric failures (centralizes semantics) def _fail_safely(reason: str) -> float: """Return 0.0 on recoverable numeric failure (reason available for future debug hooks).""" - # NOTE: presently silent to preserve legacy behavior; hook logging here if needed. + # Silent fallback; hook logging if diagnostic visibility required. _ = reason return 0.0 @@ -385,28 +356,10 @@ def _fail_safely(reason: str) -> float: def validate_reward_parameters( params: RewardParams, ) -> Tuple[RewardParams, Dict[str, Dict[str, Any]]]: - """Validate and clamp reward parameter values. + """Clamp parameters to bounds and coerce booleans. - This function enforces numeric bounds declared in ``_PARAMETER_BOUNDS``. Values - outside their allowed range are clamped and an entry is recorded in the - ``adjustments`` mapping describing the original value, the adjusted value and the - reason (which bound triggered the change). Non‑finite values are reset to the - minimum bound (or 0.0 if no explicit minimum is defined). 
- - It does NOT perform schema validation of any DataFrame (legacy text removed). - - Parameters - ---------- - params : dict - Raw user supplied reward parameter overrides (already merged with defaults - upstream). The dict is not mutated in‑place; a sanitized copy is returned. - - Returns - ------- - sanitized_params : dict - Possibly adjusted copy of the provided parameters. - adjustments : dict[str, dict] - Mapping: param -> {original, adjusted, reason} for every modified entry. + Returns sanitized copy plus adjustments mapping (param -> original/adjusted/reason). + Non‑finite numerics fall back to min bound or 0.0. """ sanitized = dict(params) adjustments: Dict[str, Dict[str, Any]] = {} @@ -460,16 +413,7 @@ def validate_reward_parameters( def _normalize_and_validate_mode(params: RewardParams) -> None: - """Align normalization of ``exit_attenuation_mode`` with ReforceXY environment. - - Behavior (mirrors in-env logic): - - Do not force lowercase or strip user formatting; use the value as provided. - - Supported modes (case-sensitive): {legacy, sqrt, linear, power, half_life}. - - If the value is not among supported keys, silently fallback to 'linear' - without emitting a warning (environment side performs a silent fallback). - - If the key is absent or value is ``None``: leave untouched (upstream defaults - will inject 'linear'). - """ + """Validate exit_attenuation_mode; silently fallback to 'linear' if invalid.""" if "exit_attenuation_mode" not in params: return @@ -593,51 +537,7 @@ def _get_exit_factor( duration_ratio: float, params: RewardParams, ) -> float: - """Compute exit factor controlling time attenuation of exit reward. - - Purpose - ------- - Produces a multiplicative factor applied to raw PnL at exit: - exit_reward = pnl * exit_factor - where: - exit_factor = time_kernel(base_factor, effective_duration_ratio) * pnl_factor - - Parity - ------ - Mirrors environment method ``ReforceXY._get_exit_factor`` for offline / synthetic analysis. 
- - Algorithm - --------- - 1. Validate finiteness & clamp negative duration to 0. - 2. Apply optional plateau: effective_dr = 0 while duration_ratio <= grace else (dr - grace). - 3. Select kernel: legacy | sqrt | linear | power | half_life (all monotonic except legacy discontinuity at dr=1). - 4. Multiply by externally supplied ``pnl_factor`` (profit & efficiency modulation). - 5. Invariants & safety: non-finite -> 0; prevent negative factor on non-negative pnl; warn on large magnitude. - - Parameters - ---------- - base_factor : float - Base scaling constant before temporal attenuation. - pnl : float - Realized (or unrealized at exit decision) profit/loss. - pnl_factor : float - PnL modulation factor (win amplification + efficiency) computed separately. - duration_ratio : float - trade_duration / max_trade_duration (pre-clamped upstream). - params : dict - Reward parameter mapping. - - Returns - ------- - float - Exit factor (>=0 unless pnl < 0 and invariants disabled). - - Notes - ----- - - Legacy kernel is discontinuous and maintained for backward compatibility only. - - Combine with ``_get_pnl_factor`` for full exit reward shaping (non-PBRS, path dependent). - - Plateau introduces a derivative kink at the grace boundary. - """ + """Exit attenuation factor (kernel + optional plateau) * pnl_factor with invariants.""" # Basic finiteness checks if ( not np.isfinite(base_factor) @@ -782,50 +682,7 @@ def _get_pnl_factor( profit_target: float, risk_reward_ratio: float, ) -> float: - """Compute PnL amplification factor (profit target over/under performance + efficiency). - - Purpose - ------- - Encapsulates profit overshoot bonus and controlled loss penalty plus an efficiency tilt - based on intra-trade utilization of observed profit range. - - Parity - ------ - Mirrors environment method ``MyRLEnv._get_pnl_factor``. - - Algorithm - --------- - 1. Compute pnl_ratio = pnl / profit_target (profit_target already includes RR upstream). - 2. 
If |pnl_ratio| <= 1: pnl_target_factor = 1.0. - 3. Else compute base = tanh(beta * (|pnl_ratio| - 1)). - a. Gain branch (pnl_ratio > 1): 1 + win_reward_factor * base - b. Loss branch (pnl_ratio < -1/rr): 1 + (win_reward_factor * rr) * base - 4. Efficiency: derive efficiency_ratio within [0,1] from intra-trade min/max; add linear tilt - around efficiency_center scaled by efficiency_weight (sign-flipped for losses). - 5. Return max(0, pnl_target_factor * efficiency_factor). - - Parameters - ---------- - params : dict - Reward parameter mapping. - context : RewardContext - Current PnL and intra-trade extrema. - profit_target : float - Profit objective (already RR-scaled when called from calculate_reward). - risk_reward_ratio : float - RR used to set asymmetric loss trigger threshold (pnl_ratio < -1/RR). - - Returns - ------- - float - Non-negative factor (0 if invalid inputs or degenerate target). - - Notes - ----- - - Symmetric tanh avoids unbounded amplification. - - Loss penalty magnitude scaled by RR to keep incentive structure consistent across setups. - - Efficiency tilt introduces path dependence (non-PBRS component). - """ + """PnL factor: tanh overshoot/loss modulation + efficiency tilt (non-negative).""" pnl = context.pnl if ( not np.isfinite(pnl) @@ -1289,11 +1146,7 @@ def simulate_samples( def _validate_simulation_invariants(df: pd.DataFrame) -> None: - """Validate critical algorithmic invariants in simulated data. - - This function ensures mathematical correctness and catches algorithmic bugs. - Failures here indicate fundamental implementation errors that must be fixed. 
- """ + """Fail fast if simulation violates PnL or action invariants.""" # INVARIANT 1: PnL Conservation - Total PnL must equal sum of exit PnL total_pnl = df["pnl"].sum() exit_action_mask = df["action"].isin([2.0, 4.0]) @@ -1417,37 +1270,7 @@ def _binned_stats( target: str, bins: Iterable[float], ) -> pd.DataFrame: - """Compute aggregated statistics of a target variable across value bins. - - Purpose - ------- - Provide consistent binned descriptive statistics (count, mean, std, min, max) - for exploratory diagnostics of reward component relationships. - - Parameters - ---------- - df : pd.DataFrame - Source dataframe containing at minimum the ``column`` and ``target`` fields. - column : str - Name of the column whose values are used to create bins. - target : str - Column whose statistics are aggregated per bin. - bins : Iterable[float] - Monotonic sequence of bin edges (len >= 2). Values outside the range are - clipped to the boundary edges prior to bin assignment. - - Returns - ------- - pd.DataFrame - Dataframe indexed by stringified interval with columns: - ``count``, ``mean``, ``std``, ``min``, ``max``. - - Notes - ----- - - Duplicate bin edges are dropped via pandas ``cut(duplicates='drop')``. - - Non-finite or missing bins after clipping are excluded prior to grouping. - - This helper is deterministic and side-effect free. - """ + """Return count/mean/std/min/max of target grouped by clipped bins of column.""" bins_arr = np.asarray(list(bins), dtype=float) if bins_arr.ndim != 1 or bins_arr.size < 2: raise ValueError("bins must contain at least two edges") @@ -1471,34 +1294,7 @@ def _binned_stats( def _compute_relationship_stats( df: pd.DataFrame, max_trade_duration: int ) -> Dict[str, Any]: - """Compute binned relationship statistics among core reward drivers. - - Purpose - ------- - Generate uniformly binned summaries for idle duration, trade duration and - realized PnL to facilitate downstream comparative or visual analyses. 
- - Parameters - ---------- - df : pd.DataFrame - Input dataframe containing ``idle_duration``, ``trade_duration``, ``pnl`` and - corresponding reward component columns (``reward_idle``, ``reward_hold``, - ``reward_exit``). - max_trade_duration : int - Maximum configured trade duration used to scale bin ranges. - - Returns - ------- - Dict[str, Any] - Dictionary with keys ``idle_stats``, ``hold_stats`` and ``exit_stats`` each - containing a binned statistics dataframe. - - Notes - ----- - - PnL bin upper bound is adjusted by a tiny epsilon when min ≈ max to avoid - degenerate intervals. - - All statistics are rounded to 6 decimal places for compactness. - """ + """Return binned stats dict for idle, trade duration and pnl (uniform bins).""" idle_bins = np.linspace(0, max_trade_duration * 3.0, 13) trade_bins = np.linspace(0, max_trade_duration * 3.0, 13) pnl_min = float(df["pnl"].min()) @@ -2185,50 +1981,7 @@ def bootstrap_confidence_intervals( *, strict_diagnostics: bool = False, ) -> Dict[str, Tuple[float, float, float]]: - """Estimate confidence intervals for mean of selected metrics via bootstrap. - - Graceful mode policy (``strict_diagnostics=False``): - - If the computed CI has zero or negative width (including inverted bounds), - it is automatically widened symmetrically around the sample mean by an - epsilon (``INTERNAL_GUARDS['degenerate_ci_epsilon']``) so the reporting - pipeline is not interrupted. A ``RewardDiagnosticsWarning`` is emitted to - make the adjustment transparent. - Strict mode policy (``strict_diagnostics=True``): - - The same anomalous condition triggers an immediate ``AssertionError`` - (fail-fast) to surface upstream causes such as a constant column or a - prior data sanitization issue. - Rationale: This dual mode avoids silently masking structural problems while - still supporting uninterrupted exploratory / CLI smoke runs. 
- - Purpose - ------- - Provide non-parametric uncertainty estimates for the mean of reward-related - metrics, robust to unknown or asymmetric distributions. - - Parameters - ---------- - df : pd.DataFrame - Source dataframe containing metric columns. - metrics : List[str] - Names of numeric columns to evaluate; silently skipped if absent. - n_bootstrap : int, default 10000 - Number of bootstrap resamples (with replacement). - confidence_level : float, default 0.95 - Nominal coverage probability for the two-sided interval. - seed : int, default 42 - Seed for local reproducible RNG (does not mutate global state). - - Returns - ------- - Dict[str, Tuple[float, float, float]] - Mapping metric -> (mean_estimate, ci_lower, ci_upper). - - Notes - ----- - - Metrics with < 10 non-null observations are skipped to avoid unstable CIs. - - Percentile method used; no bias correction or acceleration applied. - - Validation enforces finite, ordered, positive-width intervals. - """ + """Bootstrap mean CIs (percentile) per metric; skips sparse; adjusts degenerate unless strict.""" alpha = 1 - confidence_level lower_percentile = 100 * alpha / 2 upper_percentile = 100 * (1 - alpha / 2) @@ -2277,26 +2030,7 @@ def bootstrap_confidence_intervals( def _validate_bootstrap_results( results: Dict[str, Tuple[float, float, float]], *, strict_diagnostics: bool ) -> None: - """Validate structural and numerical integrity of bootstrap CI outputs. - - Purpose - ------- - Fail fast if any generated confidence interval violates expected invariants. - - Parameters - ---------- - results : Dict[str, Tuple[float, float, float]] - Mapping from metric name to (mean, ci_low, ci_high). - - Returns - ------- - None - - Notes - ----- - - Raises AssertionError on first violation (finite bounds, ordering, width > 0). - - Intentionally internal; external callers should rely on exceptions for flow. 
- """ + """Validate each bootstrap CI: finite bounds, ordered, positive width (adjust or raise).""" for metric, (mean, ci_low, ci_high) in results.items(): # CI bounds must be finite if not (np.isfinite(mean) and np.isfinite(ci_low) and np.isfinite(ci_high)): @@ -2345,30 +2079,9 @@ def _validate_bootstrap_results( def distribution_diagnostics( df: pd.DataFrame, *, seed: int | None = None, strict_diagnostics: bool = False ) -> Dict[str, Any]: - """Compute distributional diagnostics for selected numeric columns. - - Purpose - ------- - Aggregate normality test statistics and moment-based shape descriptors to - support assessment of modelling assumptions or transformation needs. - - Parameters - ---------- - df : pd.DataFrame - Dataframe containing relevant numeric columns. - seed : int | None, optional - Reserved for potential future stochastic extensions; unused presently. - - Returns - ------- - Dict[str, Any] - Mapping of column name -> diagnostic results (tests, moments, p-values). + """Return mapping col-> diagnostics (tests, moments, entropy, divergences). - Notes - ----- - - Skips columns absent from the dataframe. - - Applies Shapiro-Wilk for n <= 5000 else D'Agostino's K2 due to cost. - - All numeric outputs are floats; non-finite intermediate results are ignored. + Skips missing columns; selects Shapiro-Wilk when n<=5000 else K2; ignores non-finite intermediates. """ diagnostics = {} _ = seed # placeholder to keep signature for future reproducibility extensions @@ -2499,12 +2212,7 @@ def _validate_distribution_diagnostics( raise AssertionError(f"Q-Q R^2 {key} must be in [0,1], got {value}") -"""PBRS (Potential-Based Reward Shaping) transforms & helpers are defined below. - -They are lifted earlier in the file (before CLI parser / reporting) to keep all -reward computation primitives and shaping logic grouped, reducing cognitive -distance when auditing reward correctness. 
-""" +"""PBRS (Potential-Based Reward Shaping) transforms & helpers.""" # === PBRS TRANSFORM FUNCTIONS === @@ -2673,18 +2381,7 @@ def _compute_exit_additive( def _compute_exit_potential(last_potential: float, params: RewardParams) -> float: - """Compute next potential Φ(s') for closing/exit transitions. - - Semantics: - - canonical: Φ' = 0.0 (preserves invariance, disables additives) - - non-canonical: Φ' = 0.0 (allows additives, breaks invariance) - - progressive_release: Φ' = Φ * (1 - decay) with decay clamped to [0,1] - - spike_cancel: Φ' = Φ / γ (neutralizes shaping spike ≈ 0 net effect) if γ>0 else Φ - - retain_previous: Φ' = Φ - - Invalid modes fall back to canonical. Any non-finite resulting potential is - coerced to 0.0. - """ + """Exit potential per mode (canonical/non-canonical -> 0; others transform Φ).""" mode = _get_str_param( params, "exit_potential_mode", @@ -2747,11 +2444,7 @@ def apply_potential_shaping( last_potential: float, params: RewardParams, ) -> tuple[float, float, float]: - """ - Apply PBRS potential-based reward shaping following Ng et al. (1999). - - Implements: R'(s,a,s') = R_base(s,a,s') + γΦ(s') - Φ(s) - """ + """Compute shaped reward: base + γΦ' - Φ plus (entry/exit) additives (if enabled).""" params = _enforce_pbrs_invariance(params) gamma = _get_potential_gamma(params) current_potential = _compute_hold_potential( @@ -2779,7 +2472,7 @@ def apply_potential_shaping( def _enforce_pbrs_invariance(params: RewardParams) -> RewardParams: - """Enforce PBRS invariance by auto-disabling additives in canonical mode.""" + """Disable entry/exit additives once in canonical PBRS to preserve invariance.""" mode = _get_str_param( params, "exit_potential_mode", @@ -2846,7 +2539,7 @@ def build_argument_parser() -> argparse.ArgumentParser: description="Synthetic stress-test of the ReforceXY reward shaping logic." 
     )
     parser.add_argument(
-        "--skip-feature-analysis",
+        "--skip_feature_analysis",
         action="store_true",
         help="Skip feature importance and model-based analysis for all scenarios.",
     )
@@ -3400,7 +3093,7 @@ write_complete_statistical_analysis(
     if skip_feature_analysis or len(df) < 4:
         reason = []
         if skip_feature_analysis:
-            reason.append("flag --skip-feature-analysis set")
+            reason.append("flag --skip_feature_analysis set")
         if len(df) < 4:
             reason.append("insufficient samples <4")
         reason_str = "; ".join(reason) if reason else "skipped"
diff --git a/ReforceXY/reward_space_analysis/test_cli.py b/ReforceXY/reward_space_analysis/test_cli.py
index 68bb555..d1947c9 100644
--- a/ReforceXY/reward_space_analysis/test_cli.py
+++ b/ReforceXY/reward_space_analysis/test_cli.py
@@ -164,7 +164,7 @@ def run_scenario(
     # Forward bootstrap resamples explicitly
     cmd += ["--bootstrap_resamples", str(bootstrap_resamples)]
     if skip_feature_analysis:
-        cmd.append("--skip-feature-analysis")
+        cmd.append("--skip_feature_analysis")
     if strict:
         cmd.append("--strict_diagnostics")
     start = time.perf_counter()
@@ -206,7 +206,7 @@ def main():
         help="num synthetic samples per scenario (minimum 4 for feature analysis)",
     )
     parser.add_argument(
-        "--skip-feature-analysis",
+        "--skip_feature_analysis",
         action="store_true",
         help="Skip feature importance and model-based analysis for all scenarios.",
     )
@@ -257,7 +257,7 @@ def main():
     if args.max_scenarios <= 0:
         parser.error("--max-scenarios must be > 0")
     if args.samples < 4 and not args.skip_feature_analysis:
-        parser.error("--samples must be >= 4 unless --skip-feature-analysis is set")
+        parser.error("--samples must be >= 4 unless --skip_feature_analysis is set")
     if args.strict_sample < 0:
         parser.error("--strict-sample must be >= 0")
     if args.bootstrap_resamples <= 0:
diff --git a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
index f7b12ea..b967e28 100644
---
a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py +++ b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py @@ -91,7 +91,6 @@ class RewardSpaceTestBase(unittest.TestCase): def setUp(self): """Set up test fixtures with reproducible random seed.""" - # Unified seeding for numpy + random self.seed_all(self.SEED) self.temp_dir = tempfile.mkdtemp() self.output_path = Path(self.temp_dir) -- 2.43.0