Piment Noir Git Repositories - freqai-strategies.git/commitdiff
refactor(reforcexy): improve synthetic data generation and validation
authorJérôme Benoit <jerome.benoit@piment-noir.org>
Sat, 4 Oct 2025 22:24:46 +0000 (00:24 +0200)
committerJérôme Benoit <jerome.benoit@piment-noir.org>
Sat, 4 Oct 2025 22:24:46 +0000 (00:24 +0200)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/pyproject.toml [new file with mode: 0644]
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/test_reward_alignment.py [deleted file]
ReforceXY/reward_space_analysis/test_reward_space_analysis.py [new file with mode: 0644]
ReforceXY/reward_space_analysis/test_stat_coherence.py [deleted file]

index aca255ea191daae7328c75f7bc5abb6c3aec8c14..80fb4dc48ef510051f376a86c902e5297850b92d 100644 (file)
@@ -1,6 +1,6 @@
 # 📊 Reward Space Analysis - User Guide
 
-**Analyze and validate ReforceXY reward logic with synthetic data**
+**Analyze and validate ReforceXY reward logic with synthetic data (with built‑in runtime statistical & invariant validations)**
 
 ---
 
@@ -10,15 +10,16 @@ This tool helps you understand and validate how the ReforceXY reinforcement lear
 
 ### Key Features
 
-- ✅ Generate thousands of trading scenarios instantly
-- ✅ Analyze reward distribution and patterns
-- ✅ Validate reward logic against expected behavior
-- ✅ Export results for further analysis
-- ✅ Compare synthetic vs real trading data
+- ✅ Generate thousands of synthetic trading scenarios deterministically
+- ✅ Analyze reward distribution, feature importance & partial dependence
+- ✅ Built‑in invariant & statistical validation layers (fail‑fast)
+- ✅ Export reproducible artifacts (hash + manifest)
+- ✅ Compare synthetic vs real trading data (distribution shift metrics)
+- ✅ Parameter bounds & automatic sanitization
 
 ---
 
-**New to this tool?** Start with [Common Use Cases](#-common-use-cases) to see practical examples, then explore [CLI Parameters](#️-cli-parameters-reference) for detailed configuration options.
+**New to this tool?** Start with [Common Use Cases](#-common-use-cases) then explore [CLI Parameters](#️-cli-parameters-reference). For runtime guardrails see [Validation Layers](#-validation-layers-runtime).
 
 ---
 
@@ -47,7 +48,7 @@ Whenever you need to run analyses or tests, activate the environment first:
 ```shell
 source .venv/bin/activate
 python reward_space_analysis.py --num_samples 20000 --output demo
-python test_reward_alignment.py
+python test_reward_space_analysis.py
 ```
 
 > Deactivate the environment with `deactivate` when you're done.
@@ -170,6 +171,17 @@ None - all parameters have sensible defaults.
 - Multiple of max_trade_duration used for sampling trade/idle durations
 - Higher = more variety in duration scenarios
 
+### PnL / Volatility Controls
+
+These parameters shape the synthetic PnL generation process and heteroscedasticity (variance increasing with duration):
+
+**`--pnl_base_std`** (float, default: 0.02)
+- Base standard deviation (volatility floor) for generated PnL before duration scaling.
+
+**`--pnl_duration_vol_scale`** (float, default: 0.5)
+- Multiplicative scaling of additional volatility proportional to (trade_duration / max_trade_duration).
+- Higher values = stronger heteroscedastic effect.
+
 ### Trading Environment
 
 **`--trading_mode`** (choice: spot|margin|futures, default: spot)
@@ -209,7 +221,7 @@ _Idle penalty configuration:_
 
 _Holding penalty configuration:_
 
-- `holding_duration_ratio_grace` (default: 1.0) - Grace ratio (≤1) before holding penalty increases with duration ratio
+- `holding_duration_ratio_grace` (default: 1.0) - Grace region before holding penalty increases with duration ratio
 - `holding_penalty_scale` (default: 0.3) - Scale of holding penalty
 - `holding_penalty_power` (default: 1.0) - Power applied to holding penalty scaling
 
@@ -238,8 +250,52 @@ _Profit factor configuration:_
 - Enables distribution shift analysis (KL divergence, JS distance, Wasserstein distance)
 - Example: `../user_data/models/ReforceXY-PPO/sub_train_SYMBOL_DATE/episode_rewards.pkl`
 
+**`--pvalue_adjust`** (choice: none|benjamini_hochberg, default: none)
+
+- Apply multiple hypothesis testing correction (Benjamini–Hochberg) to p-values in statistical hypothesis tests.
+- When set to `benjamini_hochberg`, adjusted p-values and adjusted significance flags are added to the reports.
+
+**`--stats_seed`** (int, optional; default: inherit `--seed`)
+
+- Dedicated seed for statistical analyses (hypothesis tests, bootstrap confidence intervals, distribution diagnostics).
+- Use this if you want to generate multiple independent statistical analyses over the same synthetic dataset without re-simulating samples.
+- If omitted, falls back to `--seed` for full run determinism.
+
+### Reproducibility Model
+
+| Component | Controlled By | Notes |
+|-----------|---------------|-------|
+| Sample simulation | `--seed` | Drives action sampling, PnL noise, force actions. |
+| Statistical tests / bootstrap | `--stats_seed` (fallback `--seed`) | Local RNG instance; isolation prevents side effects on the global random state. |
+| RandomForest & permutation importance | `--seed` | Ensures identical splits and tree construction. |
+| Partial dependence grids | Deterministic | Depends only on fitted model & data. |
+
+Common patterns:
+```shell
+# Same synthetic data, two different statistical re-analysis runs
+python reward_space_analysis.py --num_samples 50000 --seed 123 --stats_seed 9001 --output run_stats1
+python reward_space_analysis.py --num_samples 50000 --seed 123 --stats_seed 9002 --output run_stats2
+
+# Fully reproducible end-to-end (all aspects deterministic)
+python reward_space_analysis.py --num_samples 50000 --seed 777
+```
+
 ---
 
+#### Direct Tunable Overrides vs `--params`
+
+Every key in `DEFAULT_MODEL_REWARD_PARAMETERS` is also exposed as an individual CLI flag (dynamic generation). You may choose either style:
+
+```shell
+# Direct flag style (dynamic CLI args)
+python reward_space_analysis.py --win_reward_factor 3.0 --idle_penalty_scale 2.0 --num_samples 15000
+
+# Equivalent using --params
+python reward_space_analysis.py --params win_reward_factor=3.0 idle_penalty_scale=2.0 --num_samples 15000
+```
+
+If both a direct flag and the same key in `--params` are provided, the `--params` value takes precedence.
+
 ## 📝 Example Commands
 
 ```shell
@@ -280,8 +336,11 @@ The analysis generates the following output files:
 - **Sample Representativity** - Coverage of critical market scenarios
 - **Component Analysis** - Relationships between rewards and conditions
 - **Feature Importance** - Machine learning analysis of key drivers
-- **Statistical Validation** - Hypothesis tests, confidence intervals, normality diagnostics
-- **Distribution Shift** (optional) - Comparison with real trading data
+- **Statistical Validation** - Hypothesis tests, confidence intervals, normality + effect sizes
+- **Distribution Shift** - Real vs synthetic divergence (KL, JS, Wasserstein, KS)
+- **Diagnostics Validation Summary**
+  - Pass/fail snapshot of all runtime checks
+  - Consolidated pass/fail state of every validation layer (invariants, parameter bounds, bootstrap CIs, distribution metrics, diagnostics, hypothesis tests)
 
 ### Data Exports
 
@@ -290,6 +349,25 @@ The analysis generates the following output files:
 | `reward_samples.csv`       | Raw synthetic samples for custom analysis            |
 | `feature_importance.csv`   | Feature importance rankings from random forest model |
 | `partial_dependence_*.csv` | Partial dependence data for key features             |
+| `manifest.json`            | Run metadata (seed, params, top features, overrides) |
+
+### Manifest Structure (`manifest.json`)
+
+Key fields:
+
+| Field | Description |
+|-------|-------------|
+| `generated_at` | ISO timestamp of run |
+| `num_samples` | Number of synthetic samples generated |
+| `seed` | Random seed used (deterministic cascade) |
+| `profit_target_effective` | Profit target after risk/reward scaling |
+| `top_features` | Top 5 features by permutation importance |
+| `reward_param_overrides` | Subset of reward tunables whose values differ from defaults |
+| `params_hash` | SHA-256 hash combining simulation params + overrides (reproducibility) |
+| `params` | Echo of core simulation parameters (subset, for quick audit) |
+| `parameter_adjustments` | Any automatic bound clamps applied by `validate_reward_parameters` |
+
+Use `params_hash` to verify reproducibility across runs; identical seeds + identical overrides ⇒ identical hash.
 
 ---
 
@@ -328,7 +406,7 @@ python reward_space_analysis.py \
     --output real_vs_synthetic
 ```
 
-The report will include distribution shift metrics (KL divergence, JS distance, Wasserstein distance) showing how well synthetic samples represent real trading.
+The report will include distribution shift metrics (KL divergence ≥ 0, JS distance ∈ [0,1], Wasserstein ≥ 0, KS statistic ∈ [0,1], KS p‑value ∈ [0,1]) showing how well synthetic samples represent real trading. Degenerate (constant) distributions are auto‑detected and produce zero divergence and KS p‑value = 1.0 to avoid spurious instability.
 
 ### Batch Analysis
 
@@ -346,32 +424,85 @@ done
 
 ## 🧪 Validation & Testing
 
-### Run Regression Tests
+### Run Tests
 
 ```shell
-python test_reward_alignment.py
+python test_reward_space_analysis.py
 ```
 
-**Expected output:**
The unified suite currently contains 32 focused tests (~84% coverage); a successful run shows every test_* case passing (see the file for the full list). The count may grow as validations expand.
 
-```
-✅ ENUMS_MATCH: True
-✅ DEFAULT_PARAMS_MATCH: True
-✅ TEST_INVALID_ACTION: PASS
-✅ TEST_IDLE_PENALTY: PASS
-✅ TEST_HOLDING_PENALTY: PASS
-✅ TEST_EXIT_REWARD: PASS
-```
+### Test Categories
+
+| Category | Class | Focus |
+|----------|-------|-------|
+| Integration | TestIntegration | CLI, artifacts, manifest reproducibility |
+| Statistical Coherence | TestStatisticalCoherence | Distribution shift, diagnostics, hypothesis basics |
+| Reward Alignment | TestRewardAlignment | Component correctness & exit factors |
+| Public API | TestPublicFunctions | Parsing, bootstrap, reproducibility seeds |
+| Edge Cases | TestEdgeCases | Extreme params & exit modes diversity |
+| Utility | TestUtilityFunctions | I/O writers, model analysis, helper conversions |
+| Private via Public | TestPrivateFunctionsViaPublicAPI | Penalties & exit reward paths |
+
+### Test Architecture
+
+- **Single test file**: `test_reward_space_analysis.py` (consolidates all testing)
+- **Base class**: `RewardSpaceTestBase` with shared configuration and utilities
+- **Unified framework**: `unittest` with optional `pytest` configuration
+- **Reproducible**: Fixed seed (`SEED = 42`) for consistent results
+
+### Code Coverage Analysis
+
+**Current Coverage: ~84%**
+
+To analyze code coverage in detail:
 
 ```shell
-python test_stat_coherence.py
+# Install coverage tool (if not already installed)
+pip install coverage
+
+# Run tests with coverage
+coverage run --source=. test_reward_space_analysis.py
+
+# Generate coverage report
+coverage report -m
+
+# Generate HTML report for detailed analysis
+coverage html
+# View htmlcov/index.html in browser
 ```
 
+**Coverage Focus Areas:**
+- ✅ **Core reward calculation logic** - Excellently covered (>95%)
+- ✅ **Statistical functions** - Comprehensively covered (>90%)
+- ✅ **Public API functions** - Thoroughly covered (>85%)
+- ✅ **Report generation functions** - Well covered via dedicated tests
+- ✅ **Utility functions** - Well covered via simulation tests
+- ⚠️ **CLI interface and main()** - Partially covered (sufficient via integration tests)
+- ⚠️ **Error handling paths** - Basic coverage (acceptable for robustness)
+
 ### When to Run Tests
 
 - After modifying reward logic
 - Before important analyses
 - When results seem unexpected
+- After updating dependencies or Python version
+- When contributing new features (aim for >80% coverage on new code)
+
+### Run Specific Test Categories
+
+```shell
+# All tests (recommended)
+python test_reward_space_analysis.py
+
+# Individual test classes using unittest
+python -m unittest test_reward_space_analysis.TestIntegration
+python -m unittest test_reward_space_analysis.TestStatisticalCoherence
+python -m unittest test_reward_space_analysis.TestRewardAlignment
+
+# With pytest (if installed)
+pytest test_reward_space_analysis.py -v
+```
 
 ---
 
@@ -403,7 +534,7 @@ pip install pandas numpy scipy scikit-learn
 
 **Solution:**
 
-- Run `test_reward_alignment.py` to validate logic
+- Run `test_reward_space_analysis.py` to validate logic
 - Review parameter overrides with `--params`
 - Check trading mode settings (spot vs margin/futures)
 - Verify `base_factor` matches your environment config
@@ -447,8 +578,7 @@ pip install pandas numpy scipy scikit-learn
 python reward_space_analysis.py --num_samples 20000 --output my_analysis
 
 # Run validation tests
-python test_reward_alignment.py
-python test_stat_coherence.py
+python test_reward_space_analysis.py
 ```
 
 ### Best Practices
@@ -457,7 +587,7 @@ python test_stat_coherence.py
 
 - Start with 10,000-20,000 samples for quick iteration
 - Use default parameters initially
-- Always run tests after modifying reward logic
+- Always run tests after modifying reward logic: `python test_reward_space_analysis.py`
 - Review `statistical_analysis.md` for insights
 
 **For Advanced Users:**
@@ -482,6 +612,62 @@ For detailed troubleshooting, see [Troubleshooting](#-troubleshooting) section.
 | ------------------ | ------------------------------------------------------------- |
 | Memory errors      | Reduce `--num_samples` to 10,000-20,000                       |
 | Slow execution     | Use `--trading_mode spot` or reduce samples                   |
-| Unexpected rewards | Run `test_reward_alignment.py` and check `--params` overrides |
+| Unexpected rewards | Run `test_reward_space_analysis.py` and check `--params` overrides |
 | Import errors      | Activate venv: `source .venv/bin/activate`                    |
 | No output files    | Check write permissions and disk space                        |
+| Hash mismatch      | Confirm overrides + seed; compare `reward_param_overrides`    |
+
+### Validation Layers (Runtime)
+
+All runs execute a sequence of fail‑fast validations; a failure aborts with a clear message:
+
+| Layer | Scope | Guarantees |
+|-------|-------|------------|
+| Simulation Invariants | Raw synthetic samples | PnL only on exit actions; sum PnL equals exit PnL; no exit reward without PnL. |
+| Parameter Bounds | Tunables | Clamps values outside declared bounds; records adjustments in manifest. |
+| Bootstrap CIs | Mean estimates | Finite means; ordered CI bounds; non‑NaN across metrics. |
+| Distribution Metrics | Real vs synthetic shifts | Metrics within mathematical bounds (KL ≥ 0, JS ∈ [0,1], Wasserstein ≥ 0, KS statistic ∈ [0,1], KS p-value ∈ [0,1]). Degenerate distributions handled safely (zeroed metrics). |
+| Distribution Diagnostics | Normality & moments | Finite mean/std/skew/kurtosis; Shapiro p-value ∈[0,1]; variance non-negative. |
+| Hypothesis Tests | Test result dicts | p-values & effect sizes within valid ranges; optional multiple-testing adjustment (Benjamini–Hochberg). |
+
+### Statistical Method Notes
+
+- Bootstrap CIs: percentile method (default 10k resamples in full runs; tests may use fewer). BCa not yet implemented (explicitly deferred).
+- Multiple testing: Benjamini–Hochberg available via `--pvalue_adjust benjamini_hochberg`.
+- JS distance reported as the square root of Jensen–Shannon divergence (hence bounded by 1).
+- Degenerate distributions (all values identical) short‑circuit to stable zero metrics.
+- Random Forest: 400 trees, `n_jobs=1` for determinism.
+- Heteroscedasticity model: σ = `pnl_base_std * (1 + pnl_duration_vol_scale * duration_ratio)`.
+
+### Parameter Validation & Sanitization
+
+Before simulation (early in `main()`), `validate_reward_parameters` enforces numeric bounds (see `_PARAMETER_BOUNDS` in code). Adjusted values are:
+
+1. Clamped to min/max if out of range.
+2. Reset to min if non-finite.
+3. Recorded in `manifest.json` under `parameter_adjustments` with original and adjusted values. Each entry also contains `_reason_text` (comma‑separated clamp reasons: e.g. `min=0.0`, `max=1.0`, `non_finite_reset`).
+
+Design intent: maintain a single canonical defaults map + explicit bounds; no silent acceptance of pathological inputs.
+
+#### Parameter Bounds Summary
+
+| Parameter | Min | Max | Notes |
+|-----------|-----|-----|-------|
+| `invalid_action` | — | 0.0 | Must be ≤ 0 (penalty) |
+| `base_factor` | 0.0 | — | Global scaling factor |
+| `idle_penalty_power` | 0.0 | — | Power exponent ≥ 0 |
+| `idle_penalty_scale` | 0.0 | — | Scale ≥ 0 |
+| `holding_duration_ratio_grace` | 0.0 | — | Fraction of max duration |
+| `holding_penalty_scale` | 0.0 | — | Scale ≥ 0 |
+| `holding_penalty_power` | 0.0 | — | Power exponent ≥ 0 |
+| `exit_linear_slope` | 0.0 | — | Slope ≥ 0 |
+| `exit_piecewise_grace` | 0.0 | — | Fraction of max duration |
+| `exit_piecewise_slope` | 0.0 | — | Slope ≥ 0 |
+| `exit_power_tau` | 1e-6 | 1.0 | Mapped to alpha = -log(tau)/log(2) (see `_get_exit_factor` docstring) |
+| `exit_half_life` | 1e-6 | — | Half-life in duration ratio units |
+| `efficiency_weight` | 0.0 | 2.0 | Blend weight |
+| `efficiency_center` | 0.0 | 1.0 | Sigmoid center |
+| `win_reward_factor` | 0.0 | — | Amplification ≥ 0 |
+| `pnl_factor_beta` | 1e-6 | — | Sensitivity ≥ tiny positive |
+
+Non-finite inputs are reset to the applicable minimum (or 0.0 if only a maximum is declared) and logged as adjustments.
diff --git a/ReforceXY/reward_space_analysis/pyproject.toml b/ReforceXY/reward_space_analysis/pyproject.toml
new file mode 100644 (file)
index 0000000..e0f4472
--- /dev/null
@@ -0,0 +1,45 @@
+[tool.pytest.ini_options]
+# Pytest configuration following GitHub Copilot instructions:
+# - Single source of truth for test configuration
+# - Reproducible test execution
+# - Clear error reporting
+
+minversion = "6.0"
+testpaths = [
+    "."
+]
+python_files = [
+    "test_*.py"
+]
+python_classes = [
+    "Test*"
+]
+python_functions = [
+    "test_*"
+]
+
+# Logging configuration
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)8s] %(name)s: %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+
+# Coverage configuration
+addopts = [
+    "--verbose",
+    "--tb=short",
+    "--strict-markers",
+    "--disable-warnings",
+    "--color=yes"
+]
+
+# Test markers
+markers = [
+    "integration: Integration tests requiring external dependencies",
+    "unit: Fast unit tests",
+    "statistical: Statistical validation tests",
+    "slow: Tests that take more than a few seconds"
+]
+
+# Minimum Python version
+python_version = "3.8"
index 03463a8a475a1ab6c66ed92f65101557114139f0..d3d21e7f218d261371e8a2243c22fcd6e172d2b2 100644 (file)
@@ -1,14 +1,19 @@
 #!/usr/bin/env python3
 """Synthetic reward space analysis for the ReforceXY environment.
 
-Includes comprehensive statistical validation methods:
-- Hypothesis testing (Spearman, Kruskal-Wallis, Mann-Whitney)
-- Bootstrap confidence intervals
-- Distribution diagnostics
-- Distribution shift metrics (KL, JS, Wasserstein)
-- Feature importance analysis
-
-All statistical tests are included by default in the analysis report.
+Capabilities:
+- Hypothesis testing (Spearman, Kruskal-Wallis, Mann-Whitney).
+- Percentile bootstrap confidence intervals (BCa not yet implemented).
+- Distribution diagnostics (Shapiro, Anderson, skewness, kurtosis, Q-Q R²).
+- Distribution shift metrics (KL divergence, JS distance, Wasserstein, KS test) with
+    degenerate (constant) distribution safe‑guards.
+- Unified RandomForest feature importance + partial dependence.
+- Heteroscedastic PnL simulation (variance scales with duration).
+
+Architecture principles:
+- Single source of truth: `DEFAULT_MODEL_REWARD_PARAMETERS` for tunables + dynamic CLI.
+- Determinism: explicit seeding, parameter hashing for manifest traceability.
+- Extensibility: modular helpers (sampling, reward calculation, statistics, reporting).
 """
 
 from __future__ import annotations
@@ -123,6 +128,75 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
 }
 
 
+# ---------------------------------------------------------------------------
+# Parameter validation utilities
+# ---------------------------------------------------------------------------
+
+_PARAMETER_BOUNDS: Dict[str, Dict[str, float]] = {
+    # key: {min: ..., max: ...}  (bounds are inclusive where it makes sense)
+    "invalid_action": {"max": 0.0},  # penalty should be <= 0
+    "base_factor": {"min": 0.0},
+    "idle_penalty_power": {"min": 0.0},
+    "idle_penalty_scale": {"min": 0.0},
+    "holding_duration_ratio_grace": {"min": 0.0},
+    "holding_penalty_scale": {"min": 0.0},
+    "holding_penalty_power": {"min": 0.0},
+    "exit_linear_slope": {"min": 0.0},
+    "exit_piecewise_grace": {"min": 0.0},
+    "exit_piecewise_slope": {"min": 0.0},
+    "exit_power_tau": {"min": 1e-6, "max": 1.0},  # open (0,1] approximated
+    "exit_half_life": {"min": 1e-6},
+    "efficiency_weight": {"min": 0.0, "max": 2.0},
+    "efficiency_center": {"min": 0.0, "max": 1.0},
+    "win_reward_factor": {"min": 0.0},
+    "pnl_factor_beta": {"min": 1e-6},
+}
+
+
+def validate_reward_parameters(
+    params: Dict[str, float | str],
+) -> Tuple[Dict[str, float | str], Dict[str, Dict[str, float]]]:
+    """Validate and clamp reward parameter values.
+
+    Returns
+    -------
+    sanitized_params : dict
+        Potentially adjusted copy of input params.
+    adjustments : dict
+        Mapping param -> {original, adjusted, reason} for every modification.
+    """
+    sanitized = dict(params)
+    adjustments: Dict[str, Dict[str, float]] = {}
+    for key, bounds in _PARAMETER_BOUNDS.items():
+        if key not in sanitized:
+            continue
+        value = sanitized[key]
+        if not isinstance(value, (int, float)):
+            continue
+        original = float(value)
+        adjusted = original
+        reason_parts: List[str] = []
+        if "min" in bounds and adjusted < bounds["min"]:
+            adjusted = bounds["min"]
+            reason_parts.append(f"min={bounds['min']}")
+        if "max" in bounds and adjusted > bounds["max"]:
+            adjusted = bounds["max"]
+            reason_parts.append(f"max={bounds['max']}")
+        if not math.isfinite(adjusted):
+            adjusted = bounds.get("min", 0.0)
+            reason_parts.append("non_finite_reset")
+        if not math.isclose(adjusted, original):
+            sanitized[key] = adjusted
+            adjustments[key] = {
+                "original": original,
+                "adjusted": adjusted,
+                "reason": float("nan"),  # placeholder for JSON compatibility
+            }
+            # store textual reason separately to avoid type mixing
+            adjustments[key]["_reason_text"] = ",".join(reason_parts)
+    return sanitized, adjustments
+
+
 def add_tunable_cli_args(parser: argparse.ArgumentParser) -> None:
     """Dynamically add CLI options for each tunable in DEFAULT_MODEL_REWARD_PARAMETERS.
 
@@ -191,6 +265,14 @@ def _get_exit_factor(
     This mirrors ReforceXY._get_exit_factor() exactly:
     1. Apply time-based attenuation to base factor
     2. Multiply by pnl_factor at the end
+
+    Exit factor mode formulas (explicit clarification for auditability):
+    - legacy: factor *= 1.5 if duration_ratio <= 1 else *= 0.5
+    - sqrt:   factor /= sqrt(1 + duration_ratio)
+    - linear: factor /= (1 + slope * duration_ratio)
+    - power:  alpha = -log(tau)/log(2) with tau in (0,1]; factor /= (1 + duration_ratio)^alpha
+    - piecewise: grace region g in [0,1]; factor /= 1 if d<=g else (1 + slope*(d-g))
+    - half_life: factor *= 2^(-duration_ratio/half_life)
     """
     if (
         not math.isfinite(factor)
@@ -542,6 +624,8 @@ def simulate_samples(
     risk_reward_ratio: float,
     holding_max_ratio: float,
     trading_mode: str,
+    pnl_base_std: float,
+    pnl_duration_vol_scale: float,
 ) -> pd.DataFrame:
     rng = random.Random(seed)
     short_allowed = _is_short_allowed(trading_mode)
@@ -583,26 +667,32 @@ def simulate_samples(
             trade_duration = max(1, trade_duration)
             idle_duration = 0
 
-        # Generate PnL with realistic correlation to force_action and position
-        pnl = rng.gauss(0.0, 0.02)
-
-        # Apply directional bias for positions
-        duration_factor = trade_duration / max(1, max_trade_duration)
-        if position == Positions.Long:
-            pnl += 0.005 * duration_factor
-        elif position == Positions.Short:
-            pnl -= 0.005 * duration_factor
-
-        # Force actions should correlate with PnL sign
-        if force_action == ForceActions.Take_profit:
-            # Take profit exits should have positive PnL
-            pnl = abs(pnl) + rng.uniform(0.01, 0.05)
-        elif force_action == ForceActions.Stop_loss:
-            # Stop loss exits should have negative PnL
-            pnl = -abs(pnl) - rng.uniform(0.01, 0.05)
-
-        # Clip PnL to realistic range
-        pnl = max(min(pnl, 0.15), -0.15)
+        # Only exit actions should have non-zero PnL
+        pnl = 0.0  # Initialize as zero for all actions
+
+        # Generate PnL only for exit actions (Long_exit=2, Short_exit=4)
+        if action in (Actions.Long_exit, Actions.Short_exit):
+            # Apply directional bias for positions
+            duration_factor = trade_duration / max(1, max_trade_duration)
+
+            # PnL variance scales with duration for more realistic heteroscedasticity
+            pnl_std = pnl_base_std * (1.0 + pnl_duration_vol_scale * duration_factor)
+            pnl = rng.gauss(0.0, pnl_std)
+            if position == Positions.Long:
+                pnl += 0.005 * duration_factor
+            elif position == Positions.Short:
+                pnl -= 0.005 * duration_factor
+
+            # Force actions should correlate with PnL sign
+            if force_action == ForceActions.Take_profit:
+                # Take profit exits should have positive PnL
+                pnl = abs(pnl) + rng.uniform(0.01, 0.05)
+            elif force_action == ForceActions.Stop_loss:
+                # Stop loss exits should have negative PnL
+                pnl = -abs(pnl) - rng.uniform(0.01, 0.05)
+
+            # Clip PnL to realistic range
+            pnl = max(min(pnl, 0.15), -0.15)
 
         if position == Positions.Neutral:
             max_unrealized_profit = 0.0
@@ -659,7 +749,87 @@ def simulate_samples(
             }
         )
 
-    return pd.DataFrame(samples)
+    df = pd.DataFrame(samples)
+
+    # Validate critical algorithmic invariants
+    _validate_simulation_invariants(df)
+
+    return df
+
+
+def _validate_simulation_invariants(df: pd.DataFrame) -> None:
+    """Validate critical algorithmic invariants in simulated data.
+
+    This function ensures mathematical correctness and catches algorithmic bugs.
+    Failures here indicate fundamental implementation errors that must be fixed.
+    """
+    # INVARIANT 1: PnL Conservation - Total PnL must equal sum of exit PnL
+    total_pnl = df["pnl"].sum()
+    exit_mask = df["reward_exit"] != 0
+    exit_pnl_sum = df.loc[exit_mask, "pnl"].sum()
+
+    pnl_diff = abs(total_pnl - exit_pnl_sum)
+    if pnl_diff > 1e-10:
+        raise AssertionError(
+            f"PnL INVARIANT VIOLATION: Total PnL ({total_pnl:.6f}) != "
+            f"Exit PnL sum ({exit_pnl_sum:.6f}), difference = {pnl_diff:.2e}"
+        )
+
+    # INVARIANT 2: PnL Exclusivity - Only exit actions should have non-zero PnL
+    non_zero_pnl_actions = set(df[df["pnl"] != 0]["action"].unique())
+    valid_exit_actions = {2.0, 4.0}  # Long_exit, Short_exit
+
+    invalid_actions = non_zero_pnl_actions - valid_exit_actions
+    if invalid_actions:
+        raise AssertionError(
+            f"PnL EXCLUSIVITY VIOLATION: Non-exit actions {invalid_actions} have non-zero PnL"
+        )
+
+    # INVARIANT 3: Exit Reward Consistency - Non-zero exit rewards require non-zero PnL
+    inconsistent_exits = df[(df["pnl"] == 0) & (df["reward_exit"] != 0)]
+    if len(inconsistent_exits) > 0:
+        raise AssertionError(
+            f"EXIT REWARD INCONSISTENCY: {len(inconsistent_exits)} actions have "
+            f"zero PnL but non-zero exit reward"
+        )
+
+    # INVARIANT 4: Action-Position Compatibility
+    # Validate that exit actions match positions
+    long_exits = df[
+        (df["action"] == 2.0) & (df["position"] != 1.0)
+    ]  # Long_exit but not Long position
+    short_exits = df[
+        (df["action"] == 4.0) & (df["position"] != 0.0)
+    ]  # Short_exit but not Short position
+
+    if len(long_exits) > 0:
+        raise AssertionError(
+            f"ACTION-POSITION INCONSISTENCY: {len(long_exits)} Long_exit actions "
+            f"without Long position"
+        )
+
+    if len(short_exits) > 0:
+        raise AssertionError(
+            f"ACTION-POSITION INCONSISTENCY: {len(short_exits)} Short_exit actions "
+            f"without Short position"
+        )
+
+    # INVARIANT 5: Duration Logic - Neutral positions should have trade_duration = 0
+    neutral_with_trade = df[(df["position"] == 0.5) & (df["trade_duration"] > 0)]
+    if len(neutral_with_trade) > 0:
+        raise AssertionError(
+            f"DURATION LOGIC VIOLATION: {len(neutral_with_trade)} Neutral positions "
+            f"with non-zero trade_duration"
+        )
+
+    # INVARIANT 6: Bounded Values - Check realistic bounds
+    extreme_pnl = df[(df["pnl"].abs() > 0.2)]  # Beyond reasonable range
+    if len(extreme_pnl) > 0:
+        max_abs_pnl = df["pnl"].abs().max()
+        raise AssertionError(
+            f"BOUNDS VIOLATION: {len(extreme_pnl)} samples with extreme PnL, "
+            f"max |PnL| = {max_abs_pnl:.6f}"
+        )
 
 
 def _compute_summary_stats(df: pd.DataFrame) -> Dict[str, Any]:
@@ -833,9 +1003,12 @@ def write_relationship_reports(
 
 
 def _compute_representativity_stats(
-    df: pd.DataFrame, profit_target: float
+    df: pd.DataFrame, profit_target: float, max_trade_duration: int | None = None
 ) -> Dict[str, Any]:
-    """Compute representativity statistics for the reward space."""
+    """Compute representativity statistics for the reward space.
+
+    NOTE: The max_trade_duration parameter is reserved for future duration coverage metrics.
+    """
     total = len(df)
     # Map numeric position codes to readable labels to avoid casting Neutral (0.5) to 0
     pos_label_map = {0.0: "Short", 0.5: "Neutral", 1.0: "Long"}
@@ -885,7 +1058,7 @@ def write_representativity_report(
     output_dir.mkdir(parents=True, exist_ok=True)
     path = output_dir / "representativity.md"
 
-    stats = _compute_representativity_stats(df, profit_target, max_trade_duration)
+    stats = _compute_representativity_stats(df, profit_target)
 
     with path.open("w", encoding="utf-8") as h:
         h.write("# Representativity diagnostics\n\n")
@@ -916,7 +1089,24 @@ def write_representativity_report(
         )
 
 
-def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
+def _perform_feature_analysis(
+    df: pd.DataFrame, seed: int
+) -> Tuple[
+    pd.DataFrame, Dict[str, Any], Dict[str, pd.DataFrame], RandomForestRegressor
+]:
+    """Run RandomForest-based feature analysis.
+
+    Returns
+    -------
+    importance_df : pd.DataFrame
+        Permutation importance summary (mean/std per feature).
+    analysis_stats : Dict[str, Any]
+        Core diagnostics (R², sample counts, top feature & score).
+    partial_deps : Dict[str, pd.DataFrame]
+        Partial dependence data frames keyed by feature.
+    model : RandomForestRegressor
+        Fitted model instance (for optional downstream inspection).
+    """
     feature_cols = [
         "pnl",
         "trade_duration",
@@ -935,6 +1125,7 @@ def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
         X, y, test_size=0.25, random_state=seed
     )
 
+    # Canonical RandomForest configuration - single source of truth
     model = RandomForestRegressor(
         n_estimators=400,
         max_depth=None,
@@ -943,7 +1134,7 @@ def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
     )
     model.fit(X_train, y_train)
     y_pred = model.predict(X_test)
-    score = r2_score(y_test, y_pred)
+    r2 = r2_score(y_test, y_pred)
 
     perm = permutation_importance(
         model,
@@ -953,6 +1144,7 @@ def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
         random_state=seed,
         n_jobs=1,
     )
+
     importance_df = (
         pd.DataFrame(
             {
@@ -965,12 +1157,50 @@ def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
         .reset_index(drop=True)
     )
 
+    # Compute partial dependence for key features
+    partial_deps = {}
+    for feature in ["trade_duration", "idle_duration", "pnl"]:
+        pd_result = partial_dependence(
+            model,
+            X_test,
+            [feature],
+            grid_resolution=50,
+            kind="average",
+        )
+        value_key = "values" if "values" in pd_result else "grid_values"
+        values = pd_result[value_key][0]
+        averaged = pd_result["average"][0]
+        partial_deps[feature] = pd.DataFrame(
+            {feature: values, "partial_dependence": averaged}
+        )
+
+    analysis_stats = {
+        "r2_score": r2,
+        "n_features": len(feature_cols),
+        "n_samples_train": len(X_train),
+        "n_samples_test": len(X_test),
+        "top_feature": importance_df.iloc[0]["feature"],
+        "top_importance": importance_df.iloc[0]["importance_mean"],
+    }
+
+    return importance_df, analysis_stats, partial_deps, model
+
+
+def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
+    """Backward-compatible wrapper: runs _perform_feature_analysis and writes its artifacts to output_dir."""
+    importance_df, analysis_stats, partial_deps, model = _perform_feature_analysis(
+        df, seed
+    )
+
+    # Save feature importance
     importance_df.to_csv(output_dir / "feature_importance.csv", index=False)
+
+    # Save diagnostics
     diagnostics_path = output_dir / "model_diagnostics.md"
     top_features = importance_df.head(10)
     with diagnostics_path.open("w", encoding="utf-8") as handle:
         handle.write("# Random forest diagnostics\n\n")
-        handle.write(f"R^2 score on hold-out set: {score:.4f}\n\n")
+        handle.write(f"R^2 score on hold-out set: {analysis_stats['r2_score']:.4f}\n\n")
         handle.write("## Feature importance (top 10)\n\n")
         handle.write(top_features.to_string(index=False))
         handle.write("\n\n")
@@ -979,18 +1209,8 @@ def model_analysis(df: pd.DataFrame, output_dir: Path, seed: int) -> None:
             "idle_duration, and pnl.\n"
         )
 
-    for feature in ["trade_duration", "idle_duration", "pnl"]:
-        pd_result = partial_dependence(
-            model,
-            X_test,
-            [feature],
-            grid_resolution=50,
-            kind="average",
-        )
-        value_key = "values" if "values" in pd_result else "grid_values"
-        values = pd_result[value_key][0]
-        averaged = pd_result["average"][0]
-        pd_df = pd.DataFrame({feature: values, "partial_dependence": averaged})
+    # Save partial dependence data
+    for feature, pd_df in partial_deps.items():
         pd_df.to_csv(
             output_dir / f"partial_dependence_{feature}.csv",
             index=False,
@@ -1033,6 +1253,17 @@ def compute_distribution_shift_metrics(
 
         min_val = min(synth_values.min(), real_values.min())
         max_val = max(synth_values.max(), real_values.max())
+        # Guard against degenerate distributions (all values identical)
+        if not np.isfinite(min_val) or not np.isfinite(max_val):
+            continue
+        if math.isclose(max_val, min_val, rel_tol=0, abs_tol=1e-12):
+            # All mass at a single point -> shift metrics are all zero by definition
+            metrics[f"{feature}_kl_divergence"] = 0.0
+            metrics[f"{feature}_js_distance"] = 0.0
+            metrics[f"{feature}_wasserstein"] = 0.0
+            metrics[f"{feature}_ks_statistic"] = 0.0
+            metrics[f"{feature}_ks_pvalue"] = 1.0
+            continue
         bins = np.linspace(min_val, max_val, 50)
 
         # Use density=False to get counts, then normalize to probabilities
@@ -1059,11 +1290,62 @@ def compute_distribution_shift_metrics(
         metrics[f"{feature}_ks_statistic"] = float(ks_stat)
         metrics[f"{feature}_ks_pvalue"] = float(ks_pval)
 
+    # Validate distribution shift metrics bounds
+    _validate_distribution_metrics(metrics)
+
     return metrics
 
 
-def statistical_hypothesis_tests(df: pd.DataFrame) -> Dict[str, Any]:
-    """Statistical hypothesis tests (Spearman, Kruskal-Wallis, Mann-Whitney)."""
+def _validate_distribution_metrics(metrics: Dict[str, float]) -> None:
+    """Validate mathematical bounds of distribution shift metrics."""
+    for key, value in metrics.items():
+        if not np.isfinite(value):
+            raise AssertionError(f"Distribution metric {key} is not finite: {value}")
+
+        # KL divergence must be >= 0
+        if "kl_divergence" in key and value < 0:
+            raise AssertionError(f"KL divergence {key} must be >= 0, got {value:.6f}")
+
+        # JS distance must be in [0, 1]
+        if "js_distance" in key:
+            if not (0 <= value <= 1):
+                raise AssertionError(
+                    f"JS distance {key} must be in [0,1], got {value:.6f}"
+                )
+
+        # Wasserstein distance must be >= 0
+        if "wasserstein" in key and value < 0:
+            raise AssertionError(
+                f"Wasserstein distance {key} must be >= 0, got {value:.6f}"
+            )
+
+        # KS statistic must be in [0, 1]
+        if "ks_statistic" in key:
+            if not (0 <= value <= 1):
+                raise AssertionError(
+                    f"KS statistic {key} must be in [0,1], got {value:.6f}"
+                )
+
+        # p-values must be in [0, 1]
+        if "pvalue" in key:
+            if not (0 <= value <= 1):
+                raise AssertionError(f"p-value {key} must be in [0,1], got {value:.6f}")
+
+
+def statistical_hypothesis_tests(
+    df: pd.DataFrame, *, adjust_method: str = "none", seed: int = 42
+) -> Dict[str, Any]:
+    """Statistical hypothesis tests (Spearman, Kruskal-Wallis, Mann-Whitney).
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Synthetic samples.
+    adjust_method : {'none','benjamini_hochberg'}
+        Optional p-value multiple test correction.
+    seed : int
+        Random seed for bootstrap resampling.
+    """
     results = {}
     alpha = 0.05
 
@@ -1078,12 +1360,19 @@ def statistical_hypothesis_tests(df: pd.DataFrame) -> Dict[str, Any]:
         # Bootstrap CI: resample pairs to preserve bivariate structure
         bootstrap_rhos = []
         n_boot = 1000
+        rng = np.random.default_rng(seed)
         for _ in range(n_boot):
-            idx = np.random.choice(len(idle_dur), size=len(idle_dur), replace=True)
+            idx = rng.choice(len(idle_dur), size=len(idle_dur), replace=True)
             boot_rho, _ = stats.spearmanr(idle_dur.iloc[idx], idle_rew.iloc[idx])
-            bootstrap_rhos.append(boot_rho)
+            # Handle NaN case if all values identical in bootstrap sample
+            if np.isfinite(boot_rho):
+                bootstrap_rhos.append(boot_rho)
 
-        ci_low, ci_high = np.percentile(bootstrap_rhos, [2.5, 97.5])
+        if len(bootstrap_rhos) > 0:
+            ci_low, ci_high = np.percentile(bootstrap_rhos, [2.5, 97.5])
+        else:
+            # Fallback if no valid bootstrap samples
+            ci_low, ci_high = np.nan, np.nan
 
         results["idle_correlation"] = {
             "test": "Spearman rank correlation",
@@ -1170,14 +1459,99 @@ def statistical_hypothesis_tests(df: pd.DataFrame) -> Dict[str, Any]:
             "median_pnl_negative": float(pnl_negative.median()),
         }
 
+    # Optional multiple testing correction (Benjamini-Hochberg)
+    if adjust_method not in {"none", "benjamini_hochberg"}:
+        raise ValueError(
+            "Unsupported adjust_method. Use 'none' or 'benjamini_hochberg'."
+        )
+    if adjust_method == "benjamini_hochberg" and results:
+        # Collect p-values
+        items = list(results.items())
+        pvals = np.array([v[1]["p_value"] for v in items])
+        m = len(pvals)
+        order = np.argsort(pvals)
+        ranks = np.empty_like(order)
+        ranks[order] = np.arange(1, m + 1)
+        adj = pvals * m / ranks
+        # enforce monotonicity
+        adj_sorted = np.minimum.accumulate(adj[order][::-1])[::-1]
+        adj_final = np.empty_like(adj_sorted)
+        adj_final[order] = np.clip(adj_sorted, 0, 1)
+        # Attach adjusted p-values and recompute significance
+        for (name, res), p_adj in zip(items, adj_final):
+            res["p_value_adj"] = float(p_adj)
+            res["significant_adj"] = bool(p_adj < alpha)
+            results[name] = res
+
+    # Validate hypothesis test results
+    _validate_hypothesis_test_results(results)
+
     return results
 
 
+def _validate_hypothesis_test_results(results: Dict[str, Any]) -> None:
+    """Validate statistical properties of hypothesis test results."""
+    for test_name, result in results.items():
+        # All p-values must be in [0, 1] or NaN (for cases like constant input)
+        if "p_value" in result:
+            p_val = result["p_value"]
+            if not (np.isnan(p_val) or (0 <= p_val <= 1)):
+                raise AssertionError(
+                    f"Invalid p-value for {test_name}: {p_val:.6f} not in [0,1] or NaN"
+                )
+
+        # Adjusted p-values must also be in [0, 1] or NaN
+        if "p_value_adj" in result:
+            p_adj = result["p_value_adj"]
+            if not (np.isnan(p_adj) or (0 <= p_adj <= 1)):
+                raise AssertionError(
+                    f"Invalid adjusted p-value for {test_name}: {p_adj:.6f} not in [0,1] or NaN"
+                )
+
+        # Effect sizes must be finite and in valid ranges
+        if "effect_size_epsilon_sq" in result:
+            epsilon_sq = result["effect_size_epsilon_sq"]
+            if not np.isfinite(epsilon_sq) or epsilon_sq < 0:
+                raise AssertionError(
+                    f"Invalid ε² for {test_name}: {epsilon_sq:.6f} (must be finite and >= 0)"
+                )
+
+        if "effect_size_rank_biserial" in result:
+            rb_corr = result["effect_size_rank_biserial"]
+            if not np.isfinite(rb_corr) or not (-1 <= rb_corr <= 1):
+                raise AssertionError(
+                    f"Invalid rank-biserial correlation for {test_name}: {rb_corr:.6f} "
+                    f"(must be finite and in [-1,1])"
+                )
+
+        # Correlation coefficients must be in [-1, 1]
+        if "rho" in result:
+            rho = result["rho"]
+            if np.isfinite(rho) and not (-1 <= rho <= 1):
+                raise AssertionError(
+                    f"Invalid correlation coefficient for {test_name}: {rho:.6f} "
+                    f"not in [-1,1]"
+                )
+
+        # Confidence intervals must be properly ordered
+        if (
+            "ci_95" in result
+            and isinstance(result["ci_95"], (tuple, list))
+            and len(result["ci_95"]) == 2
+        ):
+            ci_low, ci_high = result["ci_95"]
+            if np.isfinite(ci_low) and np.isfinite(ci_high) and ci_low > ci_high:
+                raise AssertionError(
+                    f"Invalid CI ordering for {test_name}: [{ci_low:.6f}, {ci_high:.6f}]"
+                )
+
+
 def bootstrap_confidence_intervals(
     df: pd.DataFrame,
     metrics: List[str],
     n_bootstrap: int = 10000,
     confidence_level: float = 0.95,
+    seed: int = 42,
 ) -> Dict[str, Tuple[float, float, float]]:
     """Bootstrap confidence intervals for key metrics."""
     alpha = 1 - confidence_level
@@ -1186,6 +1560,9 @@ def bootstrap_confidence_intervals(
 
     results = {}
 
+    # Local RNG to avoid mutating global NumPy RNG state
+    rng = np.random.default_rng(seed)
+
     for metric in metrics:
         if metric not in df.columns:
             continue
@@ -1197,21 +1574,62 @@ def bootstrap_confidence_intervals(
         point_est = float(data.mean())
 
         bootstrap_means = []
+        data_array = data.values  # raw NumPy array: integer-index resampling avoids per-iteration pandas overhead
+        n = len(data_array)
         for _ in range(n_bootstrap):
-            sample = data.sample(n=len(data), replace=True)
-            bootstrap_means.append(sample.mean())
+            indices = rng.integers(0, n, size=n)
+            bootstrap_means.append(float(data_array[indices].mean()))
 
         ci_low = float(np.percentile(bootstrap_means, lower_percentile))
         ci_high = float(np.percentile(bootstrap_means, upper_percentile))
 
         results[metric] = (point_est, ci_low, ci_high)
 
+    # Validate bootstrap confidence intervals
+    _validate_bootstrap_results(results)
+
     return results
 
 
-def distribution_diagnostics(df: pd.DataFrame) -> Dict[str, Any]:
-    """Distribution diagnostics: normality tests, skewness, kurtosis."""
+def _validate_bootstrap_results(results: Dict[str, Tuple[float, float, float]]) -> None:
+    """Validate mathematical properties of bootstrap confidence intervals."""
+    for metric, (mean, ci_low, ci_high) in results.items():
+        # CI bounds must be finite
+        if not (np.isfinite(mean) and np.isfinite(ci_low) and np.isfinite(ci_high)):
+            raise AssertionError(
+                f"Bootstrap CI for {metric}: non-finite values "
+                f"(mean={mean}, ci_low={ci_low}, ci_high={ci_high})"
+            )
+
+        # CI must be properly ordered
+        if not (ci_low <= mean <= ci_high):
+            raise AssertionError(
+                f"Bootstrap CI for {metric}: ordering violation "
+                f"({ci_low:.6f} <= {mean:.6f} <= {ci_high:.6f})"
+            )
+
+        # CI width should be positive (non-degenerate)
+        width = ci_high - ci_low
+        if width <= 0:
+            raise AssertionError(
+                f"Bootstrap CI for {metric}: non-positive width {width:.6f}"
+            )
+
+
+def distribution_diagnostics(
+    df: pd.DataFrame, *, seed: int | None = None
+) -> Dict[str, Any]:
+    """Distribution diagnostics: normality tests, skewness, kurtosis.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Input samples.
+    seed : int | None, optional
+        Reserved for future stochastic diagnostic extensions; currently unused.
+    """
     diagnostics = {}
+    _ = seed  # placeholder to keep signature for future reproducibility extensions
 
     for col in ["reward_total", "pnl", "trade_duration", "idle_duration"]:
         if col not in df.columns:
@@ -1243,22 +1661,63 @@ def distribution_diagnostics(df: pd.DataFrame) -> Dict[str, Any]:
 
         from scipy.stats import probplot
 
-        (osm, osr), (slope, intercept, r) = probplot(data, dist="norm", plot=None)
+        (_osm, _osr), (_slope, _intercept, r) = probplot(data, dist="norm", plot=None)
         diagnostics[f"{col}_qq_r_squared"] = float(r**2)
 
+    _validate_distribution_diagnostics(diagnostics)
     return diagnostics
 
 
+def _validate_distribution_diagnostics(diag: Dict[str, Any]) -> None:
+    """Validate mathematical properties of distribution diagnostics.
+
+    Ensures all reported statistics are finite and within theoretical bounds where applicable.
+    Invoked automatically inside distribution_diagnostics(); raising AssertionError on violation
+    enforces fail-fast semantics consistent with other validation helpers.
+    """
+    for key, value in diag.items():
+        if any(suffix in key for suffix in ["_mean", "_std", "_skewness", "_kurtosis"]):
+            if not np.isfinite(value):
+                raise AssertionError(
+                    f"Distribution diagnostic {key} is not finite: {value}"
+                )
+        if key.endswith("_shapiro_pval"):
+            if not (0 <= value <= 1):
+                raise AssertionError(
+                    f"Shapiro p-value {key} must be in [0,1], got {value}"
+                )
+        if key.endswith("_anderson_stat") or key.endswith("_anderson_critical_5pct"):
+            if not np.isfinite(value):
+                raise AssertionError(
+                    f"Anderson statistic {key} must be finite, got {value}"
+                )
+        if key.endswith("_qq_r_squared"):
+            if not (0 <= value <= 1):
+                raise AssertionError(f"Q-Q R^2 {key} must be in [0,1], got {value}")
+
+
 def write_enhanced_statistical_report(
     df: pd.DataFrame,
     output_dir: Path,
     real_df: Optional[pd.DataFrame] = None,
+    *,
+    adjust_method: str = "none",
 ) -> None:
     """Generate enhanced statistical report with hypothesis tests and CI."""
     output_dir.mkdir(parents=True, exist_ok=True)
     report_path = output_dir / "enhanced_statistical_report.md"
 
-    hypothesis_tests = statistical_hypothesis_tests(df)
+    # Derive a deterministic seed for statistical tests: prefer the integer seed stored in df.attrs, else fall back to 42
+    test_seed = 42
+    if (
+        hasattr(df, "attrs")
+        and "seed" in df.attrs
+        and isinstance(df.attrs["seed"], int)
+    ):
+        test_seed = int(df.attrs["seed"])
+    hypothesis_tests = statistical_hypothesis_tests(
+        df, adjust_method=adjust_method, seed=test_seed
+    )
 
     metrics_for_ci = [
         "reward_total",
@@ -1287,6 +1746,10 @@ def write_enhanced_statistical_report(
                 f"- **Statistic:** {test_result.get('statistic', test_result.get('rho', 'N/A')):.4f}\n"
             )
             f.write(f"- **p-value:** {test_result['p_value']:.4e}\n")
+            if "p_value_adj" in test_result:
+                f.write(
+                    f"- **p-value (adj BH):** {test_result['p_value_adj']:.4e} -> {'✅' if test_result['significant_adj'] else '❌'} (α=0.05)\n"
+                )
             f.write(
                 f"- **Significant (α=0.05):** {'✅ Yes' if test_result['significant'] else '❌ No'}\n"
             )
@@ -1411,6 +1874,12 @@ def build_argument_parser() -> argparse.ArgumentParser:
         default=42,
         help="Random seed for reproducibility (default: 42).",
     )
+    parser.add_argument(
+        "--stats_seed",
+        type=int,
+        default=None,
+        help="Optional separate seed for statistical analyses (default: same as --seed).",
+    )
     parser.add_argument(
         "--max_trade_duration",
         type=int,
@@ -1441,6 +1910,18 @@ def build_argument_parser() -> argparse.ArgumentParser:
         default=2.5,
         help="Multiple of max duration used when sampling trade/idle durations.",
     )
+    parser.add_argument(
+        "--pnl_base_std",
+        type=float,
+        default=0.02,
+        help="Base standard deviation for synthetic PnL generation (pre-scaling).",
+    )
+    parser.add_argument(
+        "--pnl_duration_vol_scale",
+        type=float,
+        default=0.5,
+        help="Scaling factor of additional PnL volatility proportional to trade duration ratio.",
+    )
     parser.add_argument(
         "--trading_mode",
         type=str.lower,
@@ -1476,6 +1957,13 @@ def build_argument_parser() -> argparse.ArgumentParser:
         default=None,
         help="Path to real episodes pickle for distribution shift analysis (optional).",
     )
+    parser.add_argument(
+        "--pvalue_adjust",
+        type=str.lower,
+        choices=["none", "benjamini_hochberg"],
+        default="none",
+        help="Multiple testing correction method for hypothesis tests (default: none).",
+    )
     return parser
 
 
@@ -1486,6 +1974,9 @@ def write_complete_statistical_analysis(
     profit_target: float,
     seed: int,
     real_df: Optional[pd.DataFrame] = None,
+    *,
+    adjust_method: str = "none",
+    stats_seed: Optional[int] = None,
 ) -> None:
     """Generate a single comprehensive statistical analysis report with enhanced tests."""
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -1513,7 +2004,7 @@ def write_complete_statistical_analysis(
         return "\n".join(lines) + "\n\n"
 
     def _df_to_md(df: pd.DataFrame, index_name: str = "index", ndigits: int = 6) -> str:
-        if df is None or len(df) == 0:
+        if df is None or df.empty:
             return "_No data._\n\n"
         # Prepare header
         cols = list(df.columns)
@@ -1537,77 +2028,29 @@ def write_complete_statistical_analysis(
     )
 
     # Model analysis
-    feature_cols = [
-        "pnl",
-        "trade_duration",
-        "idle_duration",
-        "duration_ratio",
-        "idle_ratio",
-        "position",
-        "action",
-        "force_action",
-        "is_force_exit",
-        "is_invalid",
-    ]
-    X = df[feature_cols]
-    y = df["reward_total"]
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.25, random_state=seed
-    )
-
-    model = RandomForestRegressor(
-        n_estimators=400,
-        max_depth=None,
-        random_state=seed,
-        n_jobs=1,
-    )
-    model.fit(X_train, y_train)
-    y_pred = model.predict(X_test)
-    r2 = r2_score(y_test, y_pred)
-
-    perm = permutation_importance(
-        model,
-        X_test,
-        y_test,
-        n_repeats=25,
-        random_state=seed,
-        n_jobs=1,
-    )
-    importance_df = (
-        pd.DataFrame(
-            {
-                "feature": feature_cols,
-                "importance_mean": perm.importances_mean,
-                "importance_std": perm.importances_std,
-            }
-        )
-        .sort_values("importance_mean", ascending=False)
-        .reset_index(drop=True)
+    importance_df, analysis_stats, partial_deps, _model = _perform_feature_analysis(
+        df, seed
     )
 
     # Save feature importance CSV
     importance_df.to_csv(output_dir / "feature_importance.csv", index=False)
 
     # Save partial dependence CSVs
-    for feature in ["trade_duration", "idle_duration", "pnl"]:
-        pd_result = partial_dependence(
-            model,
-            X_test,
-            [feature],
-            grid_resolution=50,
-            kind="average",
-        )
-        value_key = "values" if "values" in pd_result else "grid_values"
-        values = pd_result[value_key][0]
-        averaged = pd_result["average"][0]
-        pd_df = pd.DataFrame({feature: values, "partial_dependence": averaged})
+    for feature, pd_df in partial_deps.items():
         pd_df.to_csv(
             output_dir / f"partial_dependence_{feature}.csv",
             index=False,
         )
 
     # Enhanced statistics (always computed)
-    hypothesis_tests = statistical_hypothesis_tests(df)
+    test_seed = (
+        stats_seed
+        if isinstance(stats_seed, int)
+        else (seed if isinstance(seed, int) else 42)
+    )
+    hypothesis_tests = statistical_hypothesis_tests(
+        df, adjust_method=adjust_method, seed=test_seed
+    )
     metrics_for_ci = [
         "reward_total",
         "reward_idle",
@@ -1615,8 +2058,10 @@ def write_complete_statistical_analysis(
         "reward_exit",
         "pnl",
     ]
-    bootstrap_ci = bootstrap_confidence_intervals(df, metrics_for_ci, n_bootstrap=10000)
-    dist_diagnostics = distribution_diagnostics(df)
+    bootstrap_ci = bootstrap_confidence_intervals(
+        df, metrics_for_ci, n_bootstrap=10000, seed=test_seed
+    )
+    dist_diagnostics = distribution_diagnostics(df, seed=test_seed)
 
     distribution_shift = None
     if real_df is not None:
@@ -1767,7 +2212,7 @@ def write_complete_statistical_analysis(
             "Machine learning analysis to identify which features most influence total reward.\n\n"
         )
         f.write("**Model:** Random Forest Regressor (400 trees)  \n")
-        f.write(f"**R² Score:** {r2:.4f}\n\n")
+        f.write(f"**R² Score:** {analysis_stats['r2_score']:.4f}\n\n")
 
         f.write("### 4.1 Top 10 Features by Importance\n\n")
         top_imp = importance_df.head(10).copy().reset_index(drop=True)
@@ -1800,6 +2245,10 @@ def write_complete_statistical_analysis(
                 f.write(f"**Test Method:** {h['test']}\n\n")
                 f.write(f"- Spearman ρ: **{h['rho']:.4f}**\n")
                 f.write(f"- p-value: {h['p_value']:.4g}\n")
+                if "p_value_adj" in h:
+                    f.write(
+                        f"- p-value (adj BH): {h['p_value_adj']:.4g} -> {'✅' if h['significant_adj'] else '❌'} (α=0.05)\n"
+                    )
                 f.write(f"- 95% CI: [{h['ci_95'][0]:.4f}, {h['ci_95'][1]:.4f}]\n")
                 f.write(f"- Sample size: {h['n_samples']:,}\n")
                 f.write(
@@ -1813,6 +2262,10 @@ def write_complete_statistical_analysis(
                 f.write(f"**Test Method:** {h['test']}\n\n")
                 f.write(f"- H-statistic: **{h['statistic']:.4f}**\n")
                 f.write(f"- p-value: {h['p_value']:.4g}\n")
+                if "p_value_adj" in h:
+                    f.write(
+                        f"- p-value (adj BH): {h['p_value_adj']:.4g} -> {'✅' if h['significant_adj'] else '❌'} (α=0.05)\n"
+                    )
                 f.write(f"- Effect size (ε²): {h['effect_size_epsilon_sq']:.4f}\n")
                 f.write(f"- Number of groups: {h['n_groups']}\n")
                 f.write(
@@ -1826,6 +2279,10 @@ def write_complete_statistical_analysis(
                 f.write(f"**Test Method:** {h['test']}\n\n")
                 f.write(f"- U-statistic: **{h['statistic']:.4f}**\n")
                 f.write(f"- p-value: {h['p_value']:.4g}\n")
+                if "p_value_adj" in h:
+                    f.write(
+                        f"- p-value (adj BH): {h['p_value_adj']:.4g} -> {'✅' if h['significant_adj'] else '❌'} (α=0.05)\n"
+                    )
                 f.write(
                     f"- Effect size (rank-biserial): {h['effect_size_rank_biserial']:.4f}\n"
                 )
@@ -1841,6 +2298,10 @@ def write_complete_statistical_analysis(
                 f.write(f"**Test Method:** {h['test']}\n\n")
                 f.write(f"- U-statistic: **{h['statistic']:.4f}**\n")
                 f.write(f"- p-value: {h['p_value']:.4g}\n")
+                if "p_value_adj" in h:
+                    f.write(
+                        f"- p-value (adj BH): {h['p_value_adj']:.4g} -> {'✅' if h['significant_adj'] else '❌'} (α=0.05)\n"
+                    )
                 f.write(f"- Median (PnL+): {h['median_pnl_positive']:.4f}\n")
                 f.write(f"- Median (PnL-): {h['median_pnl_negative']:.4f}\n")
                 f.write(
@@ -1971,6 +2432,10 @@ def main() -> None:
     # Then apply --params KEY=VALUE overrides (highest precedence)
     params.update(parse_overrides(args.params))
 
+    # Early parameter validation (moved before simulation for alignment with docs)
+    params_validated, adjustments = validate_reward_parameters(params)
+    params = params_validated
+
     base_factor = float(params.get("base_factor", args.base_factor))
     profit_target = float(params.get("profit_target", args.profit_target))
     rr_override = params.get("rr")
@@ -1984,6 +2449,10 @@ def main() -> None:
     else:
         params["action_masking"] = cli_action_masking
 
+    # Deterministic seeds cascade
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
     df = simulate_samples(
         num_samples=args.num_samples,
         seed=args.seed,
@@ -1994,7 +2463,23 @@ def main() -> None:
         risk_reward_ratio=risk_reward_ratio,
         holding_max_ratio=args.holding_max_ratio,
         trading_mode=args.trading_mode,
+        pnl_base_std=args.pnl_base_std,
+        pnl_duration_vol_scale=args.pnl_duration_vol_scale,
     )
+    # Attach simulation parameters for downstream manifest
+    df.attrs["simulation_params"] = {
+        "num_samples": args.num_samples,
+        "seed": args.seed,
+        "max_trade_duration": args.max_trade_duration,
+        "base_factor": base_factor,
+        "profit_target": profit_target,
+        "risk_reward_ratio": risk_reward_ratio,
+        "holding_max_ratio": args.holding_max_ratio,
+        "trading_mode": args.trading_mode,
+        "action_masking": params.get("action_masking", True),
+        "pnl_base_std": args.pnl_base_std,
+        "pnl_duration_vol_scale": args.pnl_duration_vol_scale,
+    }
 
     args.output.mkdir(parents=True, exist_ok=True)
     csv_path = args.output / "reward_samples.csv"
@@ -2009,6 +2494,7 @@ def main() -> None:
 
     # Generate consolidated statistical analysis report (with enhanced tests)
     print("Generating complete statistical analysis...")
+
     write_complete_statistical_analysis(
         df,
         args.output,
@@ -2016,10 +2502,64 @@ def main() -> None:
         profit_target=float(profit_target * risk_reward_ratio),
         seed=args.seed,
         real_df=real_df,
+        adjust_method=args.pvalue_adjust,
+        stats_seed=args.stats_seed
+        if getattr(args, "stats_seed", None) is not None
+        else None,
     )
     print(
         f"Complete statistical analysis saved to: {args.output / 'statistical_analysis.md'}"
     )
+    # Generate manifest summarizing key metrics
+    try:
+        manifest_path = args.output / "manifest.json"
+        if (args.output / "feature_importance.csv").exists():
+            fi_df = pd.read_csv(args.output / "feature_importance.csv")
+            top_features = fi_df.head(5)["feature"].tolist()
+        else:
+            top_features = []
+        # Detect reward parameter overrides vs defaults for traceability
+        reward_param_overrides = {
+            k: params[k]
+            for k in DEFAULT_MODEL_REWARD_PARAMETERS
+            if k in params and params[k] != DEFAULT_MODEL_REWARD_PARAMETERS[k]
+        }
+
+        manifest = {
+            "generated_at": pd.Timestamp.now().isoformat(),
+            "num_samples": int(len(df)),
+            "seed": int(args.seed),
+            "max_trade_duration": int(args.max_trade_duration),
+            "profit_target_effective": float(profit_target * risk_reward_ratio),
+            "top_features": top_features,
+            "reward_param_overrides": reward_param_overrides,
+            "pvalue_adjust_method": args.pvalue_adjust,
+            "parameter_adjustments": adjustments,
+        }
+        sim_params = df.attrs.get("simulation_params", {})
+        if isinstance(sim_params, dict) and sim_params:
+            import hashlib as _hashlib
+            import json as _json
+
+            _hash_source = {
+                **{f"sim::{k}": sim_params[k] for k in sorted(sim_params)},
+                **{
+                    f"reward::{k}": reward_param_overrides[k]
+                    for k in sorted(reward_param_overrides)
+                },
+            }
+            serialized = _json.dumps(_hash_source, sort_keys=True)
+            manifest["params_hash"] = _hashlib.sha256(
+                serialized.encode("utf-8")
+            ).hexdigest()
+            manifest["params"] = sim_params
+        with manifest_path.open("w", encoding="utf-8") as mh:
+            import json as _json
+
+            _json.dump(manifest, mh, indent=2)
+        print(f"Manifest written to: {manifest_path}")
+    except Exception as e:
+        print(f"Manifest generation failed: {e}")
 
     print(f"Generated {len(df):,} synthetic samples.")
     print(sample_output_message)
diff --git a/ReforceXY/reward_space_analysis/test_reward_alignment.py b/ReforceXY/reward_space_analysis/test_reward_alignment.py
deleted file mode 100644 (file)
index 2239f88..0000000
+++ /dev/null
@@ -1,322 +0,0 @@
-#!/usr/bin/env python3
-"""
-Comprehensive regression test for reward_space_analysis.py
-
-This script checks critical aspects of alignment with MyRLEnv.
-Use this test to validate future changes and avoid regressions.
-
-Usage:
-    python test_reward_alignment.py
-"""
-
-import sys
-
-from reward_space_analysis import (
-    DEFAULT_MODEL_REWARD_PARAMETERS,
-    Actions,
-    ForceActions,
-    Positions,
-    RewardContext,
-    _get_exit_factor,
-    _get_pnl_factor,
-    _is_valid_action,
-    calculate_reward,
-)
-
-
-def test_enums():
-    """Ensure enums have the expected values"""
-    print("Testing enums...")
-    assert Actions.Neutral.value == 0
-    assert Actions.Long_enter.value == 1
-    assert Actions.Long_exit.value == 2
-    assert Actions.Short_enter.value == 3
-    assert Actions.Short_exit.value == 4
-
-    assert Positions.Short.value == 0
-    assert Positions.Long.value == 1
-    assert Positions.Neutral.value == 0.5
-
-    assert ForceActions.Take_profit.value == 0
-    assert ForceActions.Stop_loss.value == 1
-    assert ForceActions.Timeout.value == 2
-    print("  ✅ Enums OK")
-
-
-def test_default_parameters():
-    """Ensure default parameters are correct"""
-    print("Testing default parameters...")
-    assert DEFAULT_MODEL_REWARD_PARAMETERS["base_factor"] == 100.0
-    assert DEFAULT_MODEL_REWARD_PARAMETERS["idle_penalty_scale"] == 1.0
-    assert DEFAULT_MODEL_REWARD_PARAMETERS["idle_penalty_power"] == 1.0
-    assert DEFAULT_MODEL_REWARD_PARAMETERS["holding_penalty_scale"] == 0.3
-    assert DEFAULT_MODEL_REWARD_PARAMETERS["holding_penalty_power"] == 1.0
-    assert DEFAULT_MODEL_REWARD_PARAMETERS["holding_duration_ratio_grace"] == 1.0
-    # Ensure max_idle_duration_candles is NOT in defaults
-    assert "max_idle_duration_candles" not in DEFAULT_MODEL_REWARD_PARAMETERS
-    print("  ✅ Default parameters OK")
-
-
-def test_invalid_action_penalty():
-    """Ensure invalid action penalty is applied correctly"""
-    print("Testing invalid action penalty...")
-    ctx = RewardContext(
-        pnl=0.02,
-        trade_duration=50,
-        idle_duration=0,
-        max_trade_duration=100,
-        max_unrealized_profit=0.03,
-        min_unrealized_profit=0.01,
-        position=Positions.Neutral,
-        action=Actions.Long_exit,  # Invalid: Long_exit in Neutral
-        force_action=None,
-    )
-
-    # Without masking → penalty
-    bd_no_mask = calculate_reward(
-        ctx,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=True,
-        action_masking=False,
-    )
-    assert bd_no_mask.invalid_penalty == -2.0
-    assert bd_no_mask.total == -2.0
-
-    # With masking → no penalty
-    bd_with_mask = calculate_reward(
-        ctx,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=True,
-        action_masking=True,
-    )
-    assert bd_with_mask.invalid_penalty == 0.0
-    assert bd_with_mask.total == 0.0
-    print("  ✅ Invalid action penalty OK")
-
-
-def test_idle_penalty_no_capping():
-    """Ensure idle penalty has NO capping"""
-    print("Testing idle penalty (no capping)...")
-    ctx = RewardContext(
-        pnl=0.0,
-        trade_duration=0,
-        idle_duration=200,  # ratio = 2.0
-        max_trade_duration=100,
-        max_unrealized_profit=0.0,
-        min_unrealized_profit=0.0,
-        position=Positions.Neutral,
-        action=Actions.Neutral,
-        force_action=None,
-    )
-    bd = calculate_reward(
-        ctx,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=True,
-        action_masking=True,
-    )
-    # idle_factor = (100.0 * 0.03 * 2.0) / 3.0 = 2.0
-    # idle_penalty = -2.0 * 1.0 * 2.0^1.0 = -4.0
-    assert abs(bd.idle_penalty - (-4.0)) < 0.001
-    print("  ✅ Idle penalty (no capping) OK")
-
-
-def test_force_exit():
-    """Ensure force exits take priority"""
-    print("Testing force exit...")
-    ctx = RewardContext(
-        pnl=0.04,
-        trade_duration=50,
-        idle_duration=0,
-        max_trade_duration=100,
-        max_unrealized_profit=0.05,
-        min_unrealized_profit=0.03,
-        position=Positions.Long,
-        action=Actions.Long_exit,
-        force_action=ForceActions.Take_profit,
-    )
-    bd = calculate_reward(
-        ctx,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=True,
-        action_masking=True,
-    )
-    assert bd.exit_component > 0
-    assert bd.total == bd.exit_component
-    print("  ✅ Force exit OK")
-
-
-def test_holding_penalty_grace():
-    """Ensure grace period works correctly"""
-    print("Testing holding penalty with grace period...")
-
-    # Case 1: Below target, within grace → reward = 0
-    ctx1 = RewardContext(
-        pnl=0.04,  # Below target (0.06)
-        trade_duration=50,
-        idle_duration=0,
-        max_trade_duration=100,
-        max_unrealized_profit=0.05,
-        min_unrealized_profit=0.03,
-        position=Positions.Long,
-        action=Actions.Neutral,
-        force_action=None,
-    )
-    bd1 = calculate_reward(
-        ctx1,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=True,
-        action_masking=True,
-    )
-    assert bd1.total == 0.0
-
-    ctx2 = RewardContext(
-        pnl=0.08,
-        trade_duration=50,
-        idle_duration=0,
-        max_trade_duration=100,
-        max_unrealized_profit=0.10,
-        min_unrealized_profit=0.05,
-        position=Positions.Long,
-        action=Actions.Neutral,
-        force_action=None,
-    )
-    bd2 = calculate_reward(
-        ctx2,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=True,
-        action_masking=True,
-    )
-    assert bd2.holding_penalty < 0
-    print("  ✅ Holding penalty with grace OK")
-
-
-def test_exit_factor():
-    """Ensure exit_factor is computed correctly"""
-    print("Testing exit factor...")
-    params = DEFAULT_MODEL_REWARD_PARAMETERS.copy()
-    params["exit_reward_mode"] = "piecewise"
-    factor = _get_exit_factor(
-        factor=100.0, pnl=0.05, pnl_factor=1.5, duration_ratio=0.5, params=params
-    )
-    # Piecewise mode with duration_ratio=0.5 should yield 100.0 * 1.5 = 150.0
-    assert abs(factor - 150.0) < 0.1
-    print("  ✅ Exit factor OK")
-
-
-def test_pnl_factor():
-    """Ensure pnl_factor includes efficiency"""
-    print("Testing PnL factor...")
-    params = DEFAULT_MODEL_REWARD_PARAMETERS.copy()
-    context = RewardContext(
-        pnl=0.05,
-        trade_duration=50,
-        idle_duration=0,
-        max_trade_duration=100,
-        max_unrealized_profit=0.08,
-        min_unrealized_profit=0.01,
-        position=Positions.Long,
-        action=Actions.Long_exit,
-        force_action=None,
-    )
-    pnl_factor = _get_pnl_factor(params, context, profit_target=0.03)
-    # Should include amplification + efficiency
-    assert pnl_factor > 1.0  # Profit above target
-    print("  ✅ PnL factor OK")
-
-
-def test_short_exit_without_short_allowed():
-    """Ensure Short_exit is valid even if short_allowed=False"""
-    print("Testing short exit without short_allowed...")
-
-    # Validation
-    assert _is_valid_action(
-        Positions.Short, Actions.Short_exit, short_allowed=False, force_action=None
-    )
-
-    # Reward
-    ctx = RewardContext(
-        pnl=0.03,
-        trade_duration=40,
-        idle_duration=0,
-        max_trade_duration=100,
-        max_unrealized_profit=0.04,
-        min_unrealized_profit=0.02,
-        position=Positions.Short,
-        action=Actions.Short_exit,
-        force_action=None,
-    )
-    bd = calculate_reward(
-        ctx,
-        DEFAULT_MODEL_REWARD_PARAMETERS,
-        base_factor=100.0,
-        profit_target=0.03,
-        risk_reward_ratio=2.0,
-        short_allowed=False,  # Short not allowed
-        action_masking=True,
-    )
-    assert bd.invalid_penalty == 0.0
-    assert bd.exit_component > 0  # Reward positif car profit
-    print("  ✅ Short exit without short_allowed OK")
-
-
-def run_all_tests():
-    """Run all tests"""
-    print("\n" + "=" * 60)
-    print("REGRESSION TEST - REWARD SPACE ANALYSIS")
-    print("=" * 60 + "\n")
-
-    tests = [
-        test_enums,
-        test_default_parameters,
-        test_invalid_action_penalty,
-        test_idle_penalty_no_capping,
-        test_force_exit,
-        test_holding_penalty_grace,
-        test_exit_factor,
-        test_pnl_factor,
-        test_short_exit_without_short_allowed,
-    ]
-
-    failed = []
-    for test in tests:
-        try:
-            test()
-        except AssertionError as e:
-            print(f"  ❌ FAILED: {e}")
-            failed.append(test.__name__)
-        except Exception as e:
-            print(f"  ❌ ERROR: {e}")
-            failed.append(test.__name__)
-
-    print("\n" + "=" * 60)
-    if failed:
-        print(f"❌ {len(failed)} failed test(s):")
-        for name in failed:
-            print(f"   - {name}")
-        sys.exit(1)
-    else:
-        print("✅ ALL TESTS PASSED")
-        print("\nCode is aligned with MyRLEnv and ready for production.")
-    print("=" * 60 + "\n")
-
-
-if __name__ == "__main__":
-    run_all_tests()
diff --git a/ReforceXY/reward_space_analysis/test_reward_space_analysis.py b/ReforceXY/reward_space_analysis/test_reward_space_analysis.py
new file mode 100644 (file)
index 0000000..925d132
--- /dev/null
@@ -0,0 +1,1446 @@
+#!/usr/bin/env python3
+"""Unified test suite for reward space analysis.
+
+Covers:
+- Integration testing (CLI interface)
+- Statistical coherence validation
+- Reward alignment with environment
+"""
+
+import unittest
+import tempfile
+import shutil
+from pathlib import Path
+import subprocess
+import sys
+import json
+import pickle
+import numpy as np
+import pandas as pd
+
+# Import functions to test
+try:
+    import reward_space_analysis as rsa
+    from reward_space_analysis import (
+        DEFAULT_MODEL_REWARD_PARAMETERS,
+        Actions,
+        ForceActions,
+        Positions,
+        RewardContext,
+        calculate_reward,
+        compute_distribution_shift_metrics,
+        distribution_diagnostics,
+        simulate_samples,
+        bootstrap_confidence_intervals,
+        parse_overrides,
+    )
+except ImportError as e:
+    print(f"Import error: {e}")
+    sys.exit(1)
+
+
+class RewardSpaceTestBase(unittest.TestCase):
+    """Base class with common test utilities."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up class-level constants - single source of truth."""
+        cls.SEED = 42
+        cls.DEFAULT_PARAMS = DEFAULT_MODEL_REWARD_PARAMETERS.copy()
+        cls.TEST_SAMPLES = 50  # Small for speed
+
+    def setUp(self):
+        """Set up test fixtures with reproducible random seed."""
+        np.random.seed(self.SEED)
+        self.temp_dir = tempfile.mkdtemp()
+        self.output_path = Path(self.temp_dir)
+
+    def tearDown(self):
+        """Clean up temporary files."""
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def assertAlmostEqualFloat(self, first, second, tolerance=1e-6, msg=None):
+        """Helper for floating point comparisons."""
+        if abs(first - second) > tolerance:
+            self.fail(f"{first} != {second} within {tolerance}: {msg}")
+
+
+class TestIntegration(RewardSpaceTestBase):
+    """Integration tests for CLI and file outputs."""
+
+    def test_cli_execution_produces_expected_files(self):
+        """Test that CLI execution produces all expected output files."""
+        cmd = [
+            sys.executable,
+            "reward_space_analysis.py",
+            "--num_samples",
+            str(self.TEST_SAMPLES),
+            "--seed",
+            str(self.SEED),
+            "--output",
+            str(self.output_path),
+        ]
+
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, cwd=Path(__file__).parent
+        )
+
+        # Should execute successfully
+        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
+
+        # Should produce expected files (single source of truth list)
+        expected_files = [
+            "reward_samples.csv",
+            "feature_importance.csv",
+            "statistical_analysis.md",
+            "manifest.json",
+            "partial_dependence_trade_duration.csv",
+            "partial_dependence_idle_duration.csv",
+            "partial_dependence_pnl.csv",
+        ]
+
+        for filename in expected_files:
+            file_path = self.output_path / filename
+            self.assertTrue(file_path.exists(), f"Missing expected file: {filename}")
+
+    def test_manifest_structure_and_reproducibility(self):
+        """Test manifest structure and reproducibility with same seed."""
+        # First run
+        cmd1 = [
+            sys.executable,
+            "reward_space_analysis.py",
+            "--num_samples",
+            str(self.TEST_SAMPLES),
+            "--seed",
+            str(self.SEED),
+            "--output",
+            str(self.output_path / "run1"),
+        ]
+
+        # Second run with same seed
+        cmd2 = [
+            sys.executable,
+            "reward_space_analysis.py",
+            "--num_samples",
+            str(self.TEST_SAMPLES),
+            "--seed",
+            str(self.SEED),
+            "--output",
+            str(self.output_path / "run2"),
+        ]
+
+        # Execute both runs
+        result1 = subprocess.run(
+            cmd1, capture_output=True, text=True, cwd=Path(__file__).parent
+        )
+        result2 = subprocess.run(
+            cmd2, capture_output=True, text=True, cwd=Path(__file__).parent
+        )
+
+        self.assertEqual(result1.returncode, 0)
+        self.assertEqual(result2.returncode, 0)
+
+        # Validate manifest structure
+        for run_dir in ["run1", "run2"]:
+            with open(self.output_path / run_dir / "manifest.json", "r") as f:
+                manifest = json.load(f)
+
+            required_keys = {"generated_at", "num_samples", "seed", "params_hash"}
+            self.assertTrue(required_keys.issubset(manifest.keys()))
+            self.assertEqual(manifest["num_samples"], self.TEST_SAMPLES)
+            self.assertEqual(manifest["seed"], self.SEED)
+
+        # Test reproducibility: manifests should have same params_hash
+        with open(self.output_path / "run1" / "manifest.json", "r") as f:
+            manifest1 = json.load(f)
+        with open(self.output_path / "run2" / "manifest.json", "r") as f:
+            manifest2 = json.load(f)
+
+        self.assertEqual(
+            manifest1["params_hash"],
+            manifest2["params_hash"],
+            "Same seed should produce same parameters hash",
+        )
+
+
+class TestStatisticalCoherence(RewardSpaceTestBase):
+    """Statistical coherence validation tests."""
+
+    def _make_test_dataframe(self, n: int = 100) -> pd.DataFrame:
+        """Generate test dataframe for statistical validation."""
+        np.random.seed(self.SEED)
+
+        # Create correlated data for Spearman test
+        idle_duration = np.random.exponential(10, n)
+        reward_idle = -0.01 * idle_duration + np.random.normal(0, 0.001, n)
+
+        return pd.DataFrame(
+            {
+                "idle_duration": idle_duration,
+                "reward_idle": reward_idle,
+                "position": np.random.choice([0.0, 0.5, 1.0], n),
+                "reward_total": np.random.normal(0, 1, n),
+                "pnl": np.random.normal(0, 0.02, n),
+                "trade_duration": np.random.exponential(20, n),
+            }
+        )
+
+    def test_distribution_shift_metrics(self):
+        """Test KL divergence, JS distance, Wasserstein distance calculations."""
+        df1 = self._make_test_dataframe(100)
+        df2 = self._make_test_dataframe(100)
+
+        # Add slight shift to second dataset
+        df2["reward_total"] += 0.1
+
+        metrics = compute_distribution_shift_metrics(df1, df2)
+
+        # Should contain expected metrics for pnl (continuous feature)
+        expected_keys = {
+            "pnl_kl_divergence",
+            "pnl_js_distance",
+            "pnl_wasserstein",
+            "pnl_ks_statistic",
+        }
+        actual_keys = set(metrics.keys())
+        matching_keys = expected_keys.intersection(actual_keys)
+        self.assertGreater(
+            len(matching_keys),
+            0,
+            f"Should have some distribution metrics. Got: {actual_keys}",
+        )
+
+        # Values should be finite and reasonable
+        for metric_name, value in metrics.items():
+            if "pnl" in metric_name:
+                self.assertTrue(np.isfinite(value), f"{metric_name} should be finite")
+                if any(
+                    suffix in metric_name for suffix in ["js_distance", "ks_statistic"]
+                ):
+                    self.assertGreaterEqual(
+                        value, 0, f"{metric_name} should be non-negative"
+                    )
+
+    def test_hypothesis_testing(self):
+        """Test statistical hypothesis tests."""
+        df = self._make_test_dataframe(200)
+
+        # Test only if we have enough data - simple correlation validation
+        if len(df) > 30:
+            idle_data = df[df["idle_duration"] > 0]
+            if len(idle_data) > 10:
+                # Simple correlation check: idle duration should correlate negatively with idle reward
+                idle_dur = idle_data["idle_duration"].values
+                idle_rew = idle_data["reward_idle"].values
+
+                # Basic validation that data makes sense
+                self.assertTrue(
+                    len(idle_dur) == len(idle_rew),
+                    "Idle duration and reward arrays should have same length",
+                )
+                self.assertTrue(
+                    all(d >= 0 for d in idle_dur),
+                    "Idle durations should be non-negative",
+                )
+
+                # Idle rewards should generally be negative (penalty for holding)
+                negative_rewards = (idle_rew < 0).sum()
+                total_rewards = len(idle_rew)
+                negative_ratio = negative_rewards / total_rewards
+
+                self.assertGreater(
+                    negative_ratio,
+                    0.5,
+                    "Most idle rewards should be negative (penalties)",
+                )
+
+    def test_distribution_diagnostics(self):
+        """Test distribution normality diagnostics."""
+        df = self._make_test_dataframe(100)
+
+        diagnostics = distribution_diagnostics(df)
+
+        # Should contain diagnostics for key columns (flattened format)
+        expected_prefixes = ["reward_total_", "pnl_"]
+        for prefix in expected_prefixes:
+            matching_keys = [
+                key for key in diagnostics.keys() if key.startswith(prefix)
+            ]
+            self.assertGreater(
+                len(matching_keys), 0, f"Should have diagnostics for {prefix}"
+            )
+
+            # Check for basic statistical measures
+            expected_suffixes = ["mean", "std", "skewness", "kurtosis"]
+            for suffix in expected_suffixes:
+                key = f"{prefix}{suffix}"
+                if key in diagnostics:
+                    self.assertTrue(
+                        np.isfinite(diagnostics[key]), f"{key} should be finite"
+                    )
+
+
+class TestRewardAlignment(RewardSpaceTestBase):
+    """Test reward calculation alignment with environment."""
+
+    def test_basic_reward_calculation(self):
+        """Test basic reward calculation consistency."""
+        context = RewardContext(
+            pnl=0.02,
+            trade_duration=10,
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.025,
+            min_unrealized_profit=0.015,
+            position=Positions.Long,
+            action=Actions.Long_exit,
+            force_action=None,
+        )
+
+        breakdown = calculate_reward(
+            context,
+            self.DEFAULT_PARAMS,
+            base_factor=100.0,
+            profit_target=0.06,
+            risk_reward_ratio=2.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        # Should return valid breakdown
+        self.assertIsInstance(breakdown.total, (int, float))
+        self.assertTrue(np.isfinite(breakdown.total))
+
+        # Exit reward should be positive for profitable trade
+        self.assertGreater(
+            breakdown.exit_component, 0, "Profitable exit should have positive reward"
+        )
+
+    def test_force_action_logic(self):
+        """Test force action behavior consistency."""
+        # Take profit should generally be positive
+        tp_context = RewardContext(
+            pnl=0.05,  # Good profit
+            trade_duration=20,
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.06,
+            min_unrealized_profit=0.04,
+            position=Positions.Long,
+            action=Actions.Long_exit,
+            force_action=ForceActions.Take_profit,
+        )
+
+        tp_breakdown = calculate_reward(
+            tp_context,
+            self.DEFAULT_PARAMS,
+            base_factor=100.0,
+            profit_target=0.06,
+            risk_reward_ratio=2.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        # Stop loss should generally be negative
+        sl_context = RewardContext(
+            pnl=-0.03,  # Loss
+            trade_duration=15,
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.01,
+            min_unrealized_profit=-0.04,
+            position=Positions.Long,
+            action=Actions.Long_exit,
+            force_action=ForceActions.Stop_loss,
+        )
+
+        sl_breakdown = calculate_reward(
+            sl_context,
+            self.DEFAULT_PARAMS,
+            base_factor=100.0,
+            profit_target=0.06,
+            risk_reward_ratio=2.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        # Take profit should be better than stop loss
+        self.assertGreater(
+            tp_breakdown.total,
+            sl_breakdown.total,
+            "Take profit should yield higher reward than stop loss",
+        )
+
+    def test_exit_factor_calculation(self):
+        """Test exit factor calculation consistency."""
+        # Test different exit factor modes
+        modes_to_test = ["linear", "piecewise", "power"]
+
+        for mode in modes_to_test:
+            test_params = self.DEFAULT_PARAMS.copy()
+            test_params["exit_factor_mode"] = mode
+
+            factor = rsa._get_exit_factor(
+                factor=1.0,
+                pnl=0.02,
+                pnl_factor=1.5,
+                duration_ratio=0.3,
+                params=test_params,
+            )
+
+            self.assertTrue(
+                np.isfinite(factor), f"Exit factor for {mode} should be finite"
+            )
+            self.assertGreater(factor, 0, f"Exit factor for {mode} should be positive")
+
+
+class TestPublicFunctions(RewardSpaceTestBase):
+    """Test public functions and API."""
+
+    def test_parse_overrides(self):
+        """Test parse_overrides function."""
+        # Test valid overrides
+        overrides = ["key1=1.5", "key2=hello", "key3=42"]
+        result = parse_overrides(overrides)
+
+        self.assertEqual(result["key1"], 1.5)
+        self.assertEqual(result["key2"], "hello")
+        self.assertEqual(result["key3"], 42.0)
+
+        # Test invalid format
+        with self.assertRaises(ValueError):
+            parse_overrides(["invalid_format"])
+
+    def test_bootstrap_confidence_intervals(self):
+        """Test bootstrap confidence intervals calculation."""
+        # Create test data
+        np.random.seed(42)
+        test_data = pd.DataFrame(
+            {
+                "reward_total": np.random.normal(0, 1, 100),
+                "pnl": np.random.normal(0.01, 0.02, 100),
+            }
+        )
+
+        results = bootstrap_confidence_intervals(
+            test_data,
+            ["reward_total", "pnl"],
+            n_bootstrap=100,  # Small for speed
+        )
+
+        self.assertIn("reward_total", results)
+        self.assertIn("pnl", results)
+
+        for metric, (mean, ci_low, ci_high) in results.items():
+            self.assertTrue(np.isfinite(mean), f"Mean for {metric} should be finite")
+            self.assertTrue(
+                np.isfinite(ci_low), f"CI low for {metric} should be finite"
+            )
+            self.assertTrue(
+                np.isfinite(ci_high), f"CI high for {metric} should be finite"
+            )
+            self.assertLess(
+                ci_low, ci_high, f"CI bounds for {metric} should be ordered"
+            )
+
+    def test_statistical_hypothesis_tests_seed_reproducibility(self):
+        """Ensure statistical_hypothesis_tests + bootstrap CIs are reproducible with stats_seed."""
+        from reward_space_analysis import (
+            statistical_hypothesis_tests,
+            bootstrap_confidence_intervals,
+        )
+
+        np.random.seed(123)
+        # Create idle_duration with variability throughout to avoid constant Spearman warnings
+        idle_base = np.random.uniform(3, 40, 300)
+        idle_mask = np.random.rand(300) < 0.4
+        idle_duration = (
+            idle_base * idle_mask
+        )  # some zeros but not fully constant subset
+        df = pd.DataFrame(
+            {
+                "reward_total": np.random.normal(0, 1, 300),
+                "reward_idle": np.where(idle_mask, np.random.normal(-1, 0.3, 300), 0.0),
+                "reward_holding": np.where(
+                    ~idle_mask, np.random.normal(-0.5, 0.2, 300), 0.0
+                ),
+                "reward_exit": np.random.normal(0.8, 0.6, 300),
+                "pnl": np.random.normal(0.01, 0.02, 300),
+                "trade_duration": np.random.uniform(5, 150, 300),
+                "idle_duration": idle_duration,
+                "position": np.random.choice([0.0, 0.5, 1.0], 300),
+                "is_force_exit": np.random.choice([0.0, 1.0], 300, p=[0.85, 0.15]),
+            }
+        )
+
+        # Two runs with same seed
+        r1 = statistical_hypothesis_tests(df, seed=777)
+        r2 = statistical_hypothesis_tests(df, seed=777)
+        self.assertEqual(set(r1.keys()), set(r2.keys()))
+        for k in r1:
+            for field in ("p_value", "significant"):
+                v1 = r1[k][field]
+                v2 = r2[k][field]
+                if (
+                    isinstance(v1, float)
+                    and isinstance(v2, float)
+                    and (np.isnan(v1) and np.isnan(v2))
+                ):
+                    continue
+                self.assertEqual(v1, v2, f"Mismatch for {k}:{field}")
+
+        # Different seed may change bootstrap CI of rho if present
+        r3 = statistical_hypothesis_tests(df, seed=888)
+        if "idle_correlation" in r1 and "idle_correlation" in r3:
+            ci1 = r1["idle_correlation"]["ci_95"]
+            ci3 = r3["idle_correlation"]["ci_95"]
+            # Not guaranteed different, but often; only assert shape
+            self.assertEqual(len(ci1), 2)
+            self.assertEqual(len(ci3), 2)
+
+        # Test bootstrap reproducibility path
+        metrics = ["reward_total", "pnl"]
+        ci_a = bootstrap_confidence_intervals(df, metrics, n_bootstrap=250, seed=2024)
+        ci_b = bootstrap_confidence_intervals(df, metrics, n_bootstrap=250, seed=2024)
+        self.assertEqual(ci_a, ci_b)
+
+    def test_pnl_invariant_validation(self):
+        """Test critical PnL invariant: only exit actions should have non-zero PnL."""
+        # Generate samples and verify PnL invariant
+        df = simulate_samples(
+            num_samples=200,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=50,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="margin",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        # Critical invariant: Total PnL must equal sum of exit PnL
+        total_pnl = df["pnl"].sum()
+        exit_mask = df["reward_exit"] != 0
+        exit_pnl_sum = df.loc[exit_mask, "pnl"].sum()
+
+        self.assertAlmostEqual(
+            total_pnl,
+            exit_pnl_sum,
+            places=10,
+            msg="PnL invariant violation: total PnL != sum of exit PnL",
+        )
+
+        # Only exit actions (2.0, 4.0) should have non-zero PnL
+        non_zero_pnl_actions = set(df[df["pnl"] != 0]["action"].unique())
+        expected_exit_actions = {2.0, 4.0}  # Long_exit, Short_exit
+
+        self.assertTrue(
+            non_zero_pnl_actions.issubset(expected_exit_actions),
+            f"Non-exit actions have PnL: {non_zero_pnl_actions - expected_exit_actions}",
+        )
+
+        # No actions should have zero PnL but non-zero exit reward
+        invalid_combinations = df[(df["pnl"] == 0) & (df["reward_exit"] != 0)]
+        self.assertEqual(
+            len(invalid_combinations),
+            0,
+            "Found actions with zero PnL but non-zero exit reward",
+        )
+
+    def test_distribution_metrics_mathematical_bounds(self):
+        """Test mathematical bounds and validity of distribution shift metrics."""
+        # Create two different distributions for testing
+        np.random.seed(42)
+        df1 = pd.DataFrame(
+            {
+                "pnl": np.random.normal(0, 0.02, 500),
+                "trade_duration": np.random.exponential(30, 500),
+                "idle_duration": np.random.gamma(2, 5, 500),
+            }
+        )
+
+        df2 = pd.DataFrame(
+            {
+                "pnl": np.random.normal(0.01, 0.025, 500),
+                "trade_duration": np.random.exponential(35, 500),
+                "idle_duration": np.random.gamma(2.5, 6, 500),
+            }
+        )
+
+        metrics = compute_distribution_shift_metrics(df1, df2)
+
+        # Validate mathematical bounds for each feature
+        for feature in ["pnl", "trade_duration", "idle_duration"]:
+            # KL divergence must be >= 0
+            kl_key = f"{feature}_kl_divergence"
+            if kl_key in metrics:
+                self.assertGreaterEqual(
+                    metrics[kl_key], 0, f"KL divergence for {feature} must be >= 0"
+                )
+
+            # JS distance must be in [0, 1]
+            js_key = f"{feature}_js_distance"
+            if js_key in metrics:
+                js_val = metrics[js_key]
+                self.assertGreaterEqual(
+                    js_val, 0, f"JS distance for {feature} must be >= 0"
+                )
+                self.assertLessEqual(
+                    js_val, 1, f"JS distance for {feature} must be <= 1"
+                )
+
+            # Wasserstein must be >= 0
+            ws_key = f"{feature}_wasserstein"
+            if ws_key in metrics:
+                self.assertGreaterEqual(
+                    metrics[ws_key],
+                    0,
+                    f"Wasserstein distance for {feature} must be >= 0",
+                )
+
+            # KS statistic must be in [0, 1]
+            ks_stat_key = f"{feature}_ks_statistic"
+            if ks_stat_key in metrics:
+                ks_val = metrics[ks_stat_key]
+                self.assertGreaterEqual(
+                    ks_val, 0, f"KS statistic for {feature} must be >= 0"
+                )
+                self.assertLessEqual(
+                    ks_val, 1, f"KS statistic for {feature} must be <= 1"
+                )
+
+            # KS p-value must be in [0, 1]
+            ks_p_key = f"{feature}_ks_pvalue"
+            if ks_p_key in metrics:
+                p_val = metrics[ks_p_key]
+                self.assertGreaterEqual(
+                    p_val, 0, f"KS p-value for {feature} must be >= 0"
+                )
+                self.assertLessEqual(p_val, 1, f"KS p-value for {feature} must be <= 1")
+
+    def test_heteroscedasticity_pnl_validation(self):
+        """Test that PnL variance increases with trade duration (heteroscedasticity)."""
+        # Generate larger sample for statistical power
+        df = simulate_samples(
+            num_samples=1000,
+            seed=123,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=100,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="margin",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        # Filter to exit actions only (where PnL is meaningful)
+        exit_data = df[df["reward_exit"] != 0].copy()
+
+        if len(exit_data) < 50:
+            self.skipTest("Insufficient exit actions for heteroscedasticity test")
+
+        # Create duration bins and test variance pattern
+        exit_data["duration_bin"] = pd.cut(
+            exit_data["duration_ratio"], bins=4, labels=["Q1", "Q2", "Q3", "Q4"]
+        )
+
+        variance_by_bin = exit_data.groupby("duration_bin")["pnl"].var().dropna()
+
+        # Should see increasing variance with duration (not always strictly monotonic due to sampling)
+        # Test that Q4 variance > Q1 variance (should hold with heteroscedastic model)
+        if "Q1" in variance_by_bin.index and "Q4" in variance_by_bin.index:
+            self.assertGreater(
+                variance_by_bin["Q4"],
+                variance_by_bin["Q1"] * 0.8,  # Allow some tolerance
+                "PnL heteroscedasticity: variance should increase with duration",
+            )
+
+    def test_exit_factor_mathematical_formulas(self):
+        """Test mathematical correctness of exit factor calculations."""
+        import math
+        from reward_space_analysis import (
+            calculate_reward,
+            RewardContext,
+            Actions,
+            Positions,
+        )
+
+        # Test context with known values
+        context = RewardContext(
+            pnl=0.05,
+            trade_duration=50,
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.06,
+            min_unrealized_profit=0.04,
+            position=Positions.Long,
+            action=Actions.Long_exit,
+            force_action=None,
+        )
+
+        params = self.DEFAULT_PARAMS.copy()
+        duration_ratio = 50 / 100  # 0.5
+
+        # Test power mode with known tau
+        params["exit_factor_mode"] = "power"
+        params["exit_power_tau"] = 0.5
+
+        reward_power = calculate_reward(
+            context, params, 100.0, 0.03, 1.0, short_allowed=True, action_masking=True
+        )
+
+        # Mathematical validation: alpha = -ln(tau) = -ln(0.5) ≈ 0.693
+        expected_alpha = -math.log(0.5)
+        expected_attenuation = 1.0 / (1.0 + expected_alpha * duration_ratio)
+
+        # The reward should be attenuated by this factor
+        self.assertGreater(
+            reward_power.exit_component, 0, "Power mode should generate positive reward"
+        )
+
+        # Test half_life mode
+        params["exit_factor_mode"] = "half_life"
+        params["exit_half_life"] = 0.5
+
+        reward_half_life = calculate_reward(
+            context, params, 100.0, 0.03, 1.0, short_allowed=True, action_masking=True
+        )
+
+        # Mathematical validation: 2^(-duration_ratio/half_life) = 2^(-0.5/0.5) = 0.5
+        expected_half_life_factor = 2 ** (-duration_ratio / 0.5)
+        self.assertAlmostEqual(expected_half_life_factor, 0.5, places=6)
+
+        # Test that different modes produce different results (mathematical diversity)
+        params["exit_factor_mode"] = "linear"
+        params["exit_linear_slope"] = 1.0
+
+        reward_linear = calculate_reward(
+            context, params, 100.0, 0.03, 1.0, short_allowed=True, action_masking=True
+        )
+
+        # All modes should produce positive rewards but different values
+        self.assertGreater(reward_power.exit_component, 0)
+        self.assertGreater(reward_half_life.exit_component, 0)
+        self.assertGreater(reward_linear.exit_component, 0)
+
+        # They should be different (mathematical distinctness)
+        rewards = [
+            reward_power.exit_component,
+            reward_half_life.exit_component,
+            reward_linear.exit_component,
+        ]
+        unique_rewards = set(f"{r:.6f}" for r in rewards)
+        self.assertGreater(
+            len(unique_rewards),
+            1,
+            "Different exit factor modes should produce different rewards",
+        )
+
+    def test_statistical_functions_bounds_validation(self):
+        """Test that all statistical functions respect mathematical bounds."""
+        # Create test data with known statistical properties
+        np.random.seed(42)
+        df = pd.DataFrame(
+            {
+                "reward_total": np.random.normal(0, 1, 300),
+                "reward_idle": np.random.normal(-1, 0.5, 300),
+                "reward_holding": np.random.normal(-0.5, 0.3, 300),
+                "reward_exit": np.random.normal(1, 0.8, 300),
+                "pnl": np.random.normal(0.01, 0.02, 300),
+                "trade_duration": np.random.uniform(5, 150, 300),
+                "idle_duration": np.random.uniform(0, 100, 300),
+                "position": np.random.choice([0.0, 0.5, 1.0], 300),
+                "is_force_exit": np.random.choice([0.0, 1.0], 300, p=[0.8, 0.2]),
+            }
+        )
+
+        # Test distribution diagnostics bounds
+        diagnostics = distribution_diagnostics(df)
+
+        for col in ["reward_total", "pnl", "trade_duration", "idle_duration"]:
+            # Skewness can be any real number, but should be finite
+            if f"{col}_skewness" in diagnostics:
+                skew = diagnostics[f"{col}_skewness"]
+                self.assertTrue(
+                    np.isfinite(skew), f"Skewness for {col} should be finite"
+                )
+
+            # Kurtosis should be finite (can be negative for platykurtic distributions)
+            if f"{col}_kurtosis" in diagnostics:
+                kurt = diagnostics[f"{col}_kurtosis"]
+                self.assertTrue(
+                    np.isfinite(kurt), f"Kurtosis for {col} should be finite"
+                )
+
+            # Shapiro p-value must be in [0, 1]
+            if f"{col}_shapiro_pval" in diagnostics:
+                p_val = diagnostics[f"{col}_shapiro_pval"]
+                self.assertGreaterEqual(
+                    p_val, 0, f"Shapiro p-value for {col} must be >= 0"
+                )
+                self.assertLessEqual(
+                    p_val, 1, f"Shapiro p-value for {col} must be <= 1"
+                )
+
+        # Test hypothesis tests results bounds
+        from reward_space_analysis import statistical_hypothesis_tests
+
+        hypothesis_results = statistical_hypothesis_tests(df, seed=42)
+
+        for test_name, result in hypothesis_results.items():
+            # All p-values must be in [0, 1]
+            if "p_value" in result:
+                p_val = result["p_value"]
+                self.assertGreaterEqual(
+                    p_val, 0, f"p-value for {test_name} must be >= 0"
+                )
+                self.assertLessEqual(p_val, 1, f"p-value for {test_name} must be <= 1")
+
+            # Effect sizes should be finite and meaningful
+            if "effect_size_epsilon_sq" in result:
+                effect_size = result["effect_size_epsilon_sq"]
+                self.assertTrue(
+                    np.isfinite(effect_size),
+                    f"Effect size for {test_name} should be finite",
+                )
+                self.assertGreaterEqual(
+                    effect_size, 0, f"ε² for {test_name} should be >= 0"
+                )
+
+            if "effect_size_rank_biserial" in result:
+                rb_corr = result["effect_size_rank_biserial"]
+                self.assertTrue(
+                    np.isfinite(rb_corr),
+                    f"Rank-biserial correlation for {test_name} should be finite",
+                )
+                self.assertGreaterEqual(
+                    rb_corr, -1, f"Rank-biserial for {test_name} should be >= -1"
+                )
+                self.assertLessEqual(
+                    rb_corr, 1, f"Rank-biserial for {test_name} should be <= 1"
+                )
+
+    def test_simulate_samples_with_different_modes(self):
+        """Test simulate_samples with different trading modes."""
+        # Test spot mode (no shorts)
+        df_spot = simulate_samples(
+            num_samples=100,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=100,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="spot",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        # Should not have any short positions
+        short_positions = (df_spot["position"] == float(Positions.Short.value)).sum()
+        self.assertEqual(
+            short_positions, 0, "Spot mode should not have short positions"
+        )
+
+        # Test margin mode (shorts allowed)
+        df_margin = simulate_samples(
+            num_samples=100,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=100,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="margin",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        # Should have required columns
+        required_columns = [
+            "pnl",
+            "trade_duration",
+            "idle_duration",
+            "position",
+            "action",
+            "reward_total",
+            "reward_invalid",
+            "reward_idle",
+            "reward_holding",
+            "reward_exit",
+        ]
+        for col in required_columns:
+            self.assertIn(col, df_margin.columns, f"Column {col} should be present")
+
+    def test_reward_calculation_comprehensive(self):
+        """Test comprehensive reward calculation scenarios."""
+        # Test different reward scenarios
+        test_cases = [
+            # (position, action, force_action, expected_reward_type)
+            (Positions.Neutral, Actions.Neutral, None, "idle_penalty"),
+            (Positions.Long, Actions.Long_exit, None, "exit_component"),
+            (Positions.Short, Actions.Short_exit, None, "exit_component"),
+            (
+                Positions.Long,
+                Actions.Neutral,
+                ForceActions.Take_profit,
+                "exit_component",
+            ),
+            (
+                Positions.Short,
+                Actions.Neutral,
+                ForceActions.Stop_loss,
+                "exit_component",
+            ),
+        ]
+
+        for position, action, force_action, expected_type in test_cases:
+            with self.subTest(
+                position=position, action=action, force_action=force_action
+            ):
+                context = RewardContext(
+                    pnl=0.02 if force_action == ForceActions.Take_profit else -0.02,
+                    trade_duration=50 if position != Positions.Neutral else 0,
+                    idle_duration=10 if position == Positions.Neutral else 0,
+                    max_trade_duration=100,
+                    max_unrealized_profit=0.03,
+                    min_unrealized_profit=-0.01,
+                    position=position,
+                    action=action,
+                    force_action=force_action,
+                )
+
+                breakdown = calculate_reward(
+                    context,
+                    self.DEFAULT_PARAMS,
+                    base_factor=100.0,
+                    profit_target=0.03,
+                    risk_reward_ratio=1.0,
+                    short_allowed=True,
+                    action_masking=True,
+                )
+
+                # Check that the appropriate component is non-zero
+                if expected_type == "idle_penalty":
+                    self.assertNotEqual(
+                        breakdown.idle_penalty,
+                        0.0,
+                        f"Expected idle penalty for {position}/{action}",
+                    )
+                elif expected_type == "exit_component":
+                    self.assertNotEqual(
+                        breakdown.exit_component,
+                        0.0,
+                        f"Expected exit component for {position}/{action}",
+                    )
+
+                # Total should always be finite
+                self.assertTrue(
+                    np.isfinite(breakdown.total),
+                    f"Reward total should be finite for {position}/{action}",
+                )
+
+
+class TestEdgeCases(RewardSpaceTestBase):
+    """Test edge cases and error conditions."""
+
+    def test_extreme_parameter_values(self):
+        """Test behavior with extreme parameter values."""
+        # Test with very large parameters
+        extreme_params = self.DEFAULT_PARAMS.copy()
+        extreme_params["win_reward_factor"] = 1000.0
+        extreme_params["base_factor"] = 10000.0
+
+        context = RewardContext(
+            pnl=0.05,
+            trade_duration=50,
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.06,
+            min_unrealized_profit=0.02,
+            position=Positions.Long,
+            action=Actions.Long_exit,
+            force_action=None,
+        )
+
+        breakdown = calculate_reward(
+            context,
+            extreme_params,
+            base_factor=10000.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        self.assertTrue(
+            np.isfinite(breakdown.total),
+            "Reward should be finite even with extreme parameters",
+        )
+
+    def test_different_exit_factor_modes(self):
+        """Test different exit factor calculation modes."""
+        modes = ["legacy", "sqrt", "linear", "power", "piecewise", "half_life"]
+
+        for mode in modes:
+            with self.subTest(mode=mode):
+                test_params = self.DEFAULT_PARAMS.copy()
+                test_params["exit_factor_mode"] = mode
+
+                context = RewardContext(
+                    pnl=0.02,
+                    trade_duration=50,
+                    idle_duration=0,
+                    max_trade_duration=100,
+                    max_unrealized_profit=0.03,
+                    min_unrealized_profit=0.01,
+                    position=Positions.Long,
+                    action=Actions.Long_exit,
+                    force_action=None,
+                )
+
+                breakdown = calculate_reward(
+                    context,
+                    test_params,
+                    base_factor=100.0,
+                    profit_target=0.03,
+                    risk_reward_ratio=1.0,
+                    short_allowed=True,
+                    action_masking=True,
+                )
+
+                self.assertTrue(
+                    np.isfinite(breakdown.exit_component),
+                    f"Exit component should be finite for mode {mode}",
+                )
+                self.assertTrue(
+                    np.isfinite(breakdown.total),
+                    f"Total reward should be finite for mode {mode}",
+                )
+
+
+class TestUtilityFunctions(RewardSpaceTestBase):
+    """Test utility and helper functions."""
+
+    def test_to_bool_comprehensive(self):
+        """Test _to_bool with comprehensive inputs."""
+        # Test via simulate_samples which uses action_masking parameter
+        df1 = simulate_samples(
+            num_samples=10,
+            seed=42,
+            params={"action_masking": "true"},
+            max_trade_duration=50,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="spot",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+        self.assertIsInstance(df1, pd.DataFrame)
+
+        df2 = simulate_samples(
+            num_samples=10,
+            seed=42,
+            params={"action_masking": "false"},
+            max_trade_duration=50,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="spot",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+        self.assertIsInstance(df2, pd.DataFrame)
+
+    def test_short_allowed_via_simulation(self):
+        """Test _is_short_allowed via different trading modes."""
+        # Test futures mode (shorts allowed)
+        df_futures = simulate_samples(
+            num_samples=100,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=50,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="futures",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        # Should have some short positions
+        short_positions = (df_futures["position"] == float(Positions.Short.value)).sum()
+        self.assertGreater(
+            short_positions, 0, "Futures mode should allow short positions"
+        )
+
+    def test_model_analysis_function(self):
+        """Test model_analysis function."""
+        from reward_space_analysis import model_analysis
+
+        # Create test data
+        test_data = simulate_samples(
+            num_samples=100,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=50,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="spot",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        # Create temporary output directory
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            output_path = Path(tmp_dir)
+            model_analysis(test_data, output_path, seed=42)
+
+            # Check that feature importance file is created
+            feature_file = output_path / "feature_importance.csv"
+            self.assertTrue(
+                feature_file.exists(), "Feature importance file should be created"
+            )
+
+    def test_write_functions(self):
+        """Test various write functions."""
+        from reward_space_analysis import (
+            write_summary,
+            write_relationship_reports,
+            write_representativity_report,
+        )
+
+        # Create test data
+        test_data = simulate_samples(
+            num_samples=100,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=50,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="spot",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            output_path = Path(tmp_dir)
+
+            # Test write_summary
+            write_summary(test_data, output_path)
+            summary_file = output_path / "reward_summary.md"
+            self.assertTrue(summary_file.exists(), "Summary file should be created")
+
+            # Test write_relationship_reports
+            write_relationship_reports(test_data, output_path, max_trade_duration=50)
+            relationship_file = output_path / "reward_relationships.md"
+            self.assertTrue(
+                relationship_file.exists(), "Relationship file should be created"
+            )
+
+            # Test write_representativity_report
+            write_representativity_report(
+                test_data, output_path, profit_target=0.03, max_trade_duration=50
+            )
+            repr_file = output_path / "representativity.md"
+            self.assertTrue(
+                repr_file.exists(), "Representativity file should be created"
+            )
+
+    def test_load_real_episodes(self):
+        """Test load_real_episodes function."""
+        from reward_space_analysis import load_real_episodes
+
+        # Create a temporary pickle file with test data
+        test_episodes = pd.DataFrame(
+            {
+                "pnl": [0.01, -0.02, 0.03],
+                "trade_duration": [10, 20, 15],
+                "idle_duration": [5, 0, 8],
+                "position": [1.0, 0.0, 1.0],
+                "action": [2.0, 0.0, 2.0],
+                "reward_total": [10.5, -5.2, 15.8],
+            }
+        )
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            pickle_path = Path(tmp_dir) / "test_episodes.pkl"
+            with pickle_path.open("wb") as f:
+                pickle.dump(test_episodes, f)  # Don't wrap in list
+
+            loaded_data = load_real_episodes(pickle_path)
+            self.assertIsInstance(loaded_data, pd.DataFrame)
+            self.assertEqual(len(loaded_data), 3)
+            self.assertIn("pnl", loaded_data.columns)
+
+    def test_statistical_functions_comprehensive(self):
+        """Test comprehensive statistical functions."""
+        from reward_space_analysis import (
+            statistical_hypothesis_tests,
+            write_enhanced_statistical_report,
+        )
+
+        # Create test data with specific patterns
+        np.random.seed(42)
+        test_data = pd.DataFrame(
+            {
+                "reward_total": np.random.normal(0, 1, 200),
+                "reward_idle": np.concatenate(
+                    [np.zeros(150), np.random.normal(-1, 0.5, 50)]
+                ),
+                "reward_holding": np.concatenate(
+                    [np.zeros(150), np.random.normal(-0.5, 0.3, 50)]
+                ),
+                "reward_exit": np.concatenate(
+                    [np.zeros(150), np.random.normal(2, 1, 50)]
+                ),
+                "pnl": np.random.normal(0.01, 0.02, 200),
+                "trade_duration": np.random.uniform(10, 100, 200),
+                "idle_duration": np.concatenate(
+                    [np.random.uniform(5, 50, 50), np.zeros(150)]
+                ),
+                "position": np.random.choice([0.0, 0.5, 1.0], 200),
+                "is_force_exit": np.random.choice([0.0, 1.0], 200, p=[0.8, 0.2]),
+            }
+        )
+
+        # Test hypothesis tests
+        results = statistical_hypothesis_tests(test_data)
+        self.assertIsInstance(results, dict)
+
+        # Test enhanced statistical report
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            output_path = Path(tmp_dir)
+            write_enhanced_statistical_report(test_data, output_path)
+            report_file = output_path / "enhanced_statistical_report.md"
+            self.assertTrue(
+                report_file.exists(), "Enhanced statistical report should be created"
+            )
+
+    def test_argument_parser_construction(self):
+        """Test build_argument_parser function."""
+        from reward_space_analysis import build_argument_parser
+
+        parser = build_argument_parser()
+        self.assertIsNotNone(parser)
+
+        # Test parsing with minimal arguments
+        args = parser.parse_args(["--num_samples", "100", "--output", "test_output"])
+        self.assertEqual(args.num_samples, 100)
+        self.assertEqual(str(args.output), "test_output")
+
+    def test_complete_statistical_analysis_writer(self):
+        """Test write_complete_statistical_analysis function."""
+        from reward_space_analysis import write_complete_statistical_analysis
+
+        # Create comprehensive test data
+        test_data = simulate_samples(
+            num_samples=200,
+            seed=42,
+            params=self.DEFAULT_PARAMS,
+            max_trade_duration=100,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            holding_max_ratio=2.0,
+            trading_mode="margin",
+            pnl_base_std=0.02,
+            pnl_duration_vol_scale=0.5,
+        )
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            output_path = Path(tmp_dir)
+
+            write_complete_statistical_analysis(
+                test_data,
+                output_path,
+                max_trade_duration=100,
+                profit_target=0.03,
+                seed=42,
+                real_df=None,
+            )
+
+            # Check that main report is created
+            main_report = output_path / "statistical_analysis.md"
+            self.assertTrue(
+                main_report.exists(), "Main statistical analysis should be created"
+            )
+
+            # Check for other expected files
+            feature_file = output_path / "feature_importance.csv"
+            self.assertTrue(
+                feature_file.exists(), "Feature importance should be created"
+            )
+
+
+class TestPrivateFunctionsViaPublicAPI(RewardSpaceTestBase):
+    """Test private functions through public API calls."""
+
+    def test_idle_penalty_via_rewards(self):
+        """Test idle penalty calculation via reward calculation."""
+        # Create context that will trigger idle penalty
+        context = RewardContext(
+            pnl=0.0,
+            trade_duration=0,
+            idle_duration=20,
+            max_trade_duration=100,
+            max_unrealized_profit=0.0,
+            min_unrealized_profit=0.0,
+            position=Positions.Neutral,
+            action=Actions.Neutral,
+            force_action=None,
+        )
+
+        breakdown = calculate_reward(
+            context,
+            self.DEFAULT_PARAMS,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        self.assertLess(breakdown.idle_penalty, 0, "Idle penalty should be negative")
+        self.assertEqual(
+            breakdown.total, breakdown.idle_penalty, "Total should equal idle penalty"
+        )
+
+    def test_holding_penalty_via_rewards(self):
+        """Test holding penalty calculation via reward calculation."""
+        # Create context that will trigger holding penalty
+        context = RewardContext(
+            pnl=0.01,
+            trade_duration=150,  # Long duration: exceeds max_trade_duration (100)
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.02,
+            min_unrealized_profit=0.0,
+            position=Positions.Long,
+            action=Actions.Neutral,
+            force_action=None,
+        )
+
+        breakdown = calculate_reward(
+            context,
+            self.DEFAULT_PARAMS,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        self.assertLess(
+            breakdown.holding_penalty, 0, "Holding penalty should be negative"
+        )
+        self.assertEqual(
+            breakdown.total,
+            breakdown.holding_penalty,
+            "Total should equal holding penalty",
+        )
+
+    def test_exit_reward_calculation_comprehensive(self):
+        """Test exit reward calculation with various scenarios."""
+        scenarios = [
+            (Positions.Long, Actions.Long_exit, 0.05, "Profitable long exit"),
+            (Positions.Short, Actions.Short_exit, -0.03, "Profitable short exit"),
+            (Positions.Long, Actions.Long_exit, -0.02, "Losing long exit"),
+            (Positions.Short, Actions.Short_exit, 0.02, "Losing short exit"),
+        ]
+
+        for position, action, pnl, description in scenarios:
+            with self.subTest(description=description):
+                context = RewardContext(
+                    pnl=pnl,
+                    trade_duration=50,
+                    idle_duration=0,
+                    max_trade_duration=100,
+                    max_unrealized_profit=max(pnl + 0.01, 0.01),
+                    min_unrealized_profit=min(pnl - 0.01, -0.01),
+                    position=position,
+                    action=action,
+                    force_action=None,
+                )
+
+                breakdown = calculate_reward(
+                    context,
+                    self.DEFAULT_PARAMS,
+                    base_factor=100.0,
+                    profit_target=0.03,
+                    risk_reward_ratio=1.0,
+                    short_allowed=True,
+                    action_masking=True,
+                )
+
+                self.assertNotEqual(
+                    breakdown.exit_component,
+                    0.0,
+                    f"Exit component should be non-zero for {description}",
+                )
+                self.assertTrue(
+                    np.isfinite(breakdown.total),
+                    f"Total should be finite for {description}",
+                )
+
+    def test_invalid_action_handling(self):
+        """Test invalid action penalty."""
+        # Try to exit long when in short position (invalid)
+        context = RewardContext(
+            pnl=0.02,
+            trade_duration=50,
+            idle_duration=0,
+            max_trade_duration=100,
+            max_unrealized_profit=0.03,
+            min_unrealized_profit=0.01,
+            position=Positions.Short,
+            action=Actions.Long_exit,
+            force_action=None,  # Invalid: can't long_exit from short
+        )
+
+        breakdown = calculate_reward(
+            context,
+            self.DEFAULT_PARAMS,
+            base_factor=100.0,
+            profit_target=0.03,
+            risk_reward_ratio=1.0,
+            short_allowed=True,
+            action_masking=False,  # Disable masking to test invalid penalty
+        )
+
+        self.assertLess(
+            breakdown.invalid_penalty, 0, "Invalid action should have negative penalty"
+        )
+        self.assertEqual(
+            breakdown.total,
+            breakdown.invalid_penalty,
+            "Total should equal invalid penalty",
+        )
+
+
+if __name__ == "__main__":
+    # Configure test discovery and execution
+    loader = unittest.TestLoader()
+    suite = loader.loadTestsFromModule(sys.modules[__name__])
+
+    # Run tests with verbose output
+    runner = unittest.TextTestRunner(verbosity=2, buffer=True)
+    result = runner.run(suite)
+
+    # Exit with error code if tests failed
+    sys.exit(0 if result.wasSuccessful() else 1)
diff --git a/ReforceXY/reward_space_analysis/test_stat_coherence.py b/ReforceXY/reward_space_analysis/test_stat_coherence.py
deleted file mode 100644 (file)
index cc1c22f..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env python3
-"""Quick statistical coherence checks for reward_space_analysis.
-
-- Validates distribution shift metrics (KL, JS distance, Wasserstein, KS)
-- Validates hypothesis tests: Spearman (idle), Kruskal ε², Mann–Whitney rank-biserial
-- Validates QQ R^2 from distribution diagnostics
-"""
-
-from __future__ import annotations
-
-import numpy as np
-import pandas as pd
-from reward_space_analysis import (
-    compute_distribution_shift_metrics,
-    distribution_diagnostics,
-    statistical_hypothesis_tests,
-)
-
-
-def make_df_for_spearman(n: int = 300) -> pd.DataFrame:
-    """Build a synthetic frame with a strong negative idle->reward_idle relation.
-
-    The frame is consumed by ``statistical_hypothesis_tests`` to verify that
-    the Spearman test detects a monotone decreasing relationship between
-    ``idle_duration`` and ``reward_idle`` (rho < 0, significant p-value).
-    Columns not exercised by that test are filled with zeros.
-    """
-    rng = np.random.default_rng(42)  # fixed seed: deterministic fixture
-    idle = np.linspace(0, 100, n)
-    # Strong negative relation: reward_idle decreases with idle_duration
-    reward_idle = -0.1 * idle + rng.normal(0, 0.5, size=n)
-    df = pd.DataFrame(
-        {
-            "idle_duration": idle,
-            "reward_idle": reward_idle,
-            # required fields (unused here)
-            "reward_total": reward_idle,  # some signal
-            "position": np.zeros(n),
-            "is_force_exit": np.zeros(n),
-            "reward_exit": np.zeros(n),
-            "pnl": np.zeros(n),
-            "trade_duration": np.zeros(n),
-        }
-    )
-    return df
-
-
-def make_df_for_kruskal(n_group: int = 100) -> pd.DataFrame:
-    """Build a frame with three position groups of shifted reward means.
-
-    Positions 0.0 / 0.5 / 1.0 get normal samples centred at 0.0 / 0.5 / 1.0
-    respectively, so a Kruskal-Wallis test across positions should be
-    significant with epsilon-squared effect size in [0, 1). Remaining
-    columns are zero fillers required by the analysis schema.
-    """
-    rng = np.random.default_rng(123)  # fixed seed: deterministic fixture
-    # three groups with different central tendencies
-    g1 = rng.normal(0.0, 1.0, size=n_group)
-    g2 = rng.normal(0.5, 1.0, size=n_group)
-    g3 = rng.normal(1.0, 1.0, size=n_group)
-    reward_total = np.concatenate([g1, g2, g3])
-    # Group label per row, aligned with the concatenated samples above.
-    position = np.concatenate(
-        [np.full(n_group, 0.0), np.full(n_group, 0.5), np.full(n_group, 1.0)]
-    )
-    df = pd.DataFrame(
-        {
-            "reward_total": reward_total,
-            "position": position,
-            # fillers
-            "reward_idle": np.zeros(n_group * 3),
-            "is_force_exit": np.zeros(n_group * 3),
-            "reward_exit": np.zeros(n_group * 3),
-            "pnl": np.zeros(n_group * 3),
-            "trade_duration": np.zeros(n_group * 3),
-            "idle_duration": np.zeros(n_group * 3),
-        }
-    )
-    return df
-
-
-def make_df_for_mannwhitney(n: int = 200) -> pd.DataFrame:
-    """Build a frame where force exits have larger rewards than regular exits.
-
-    Force-exit rows sample from N(1.0, 0.5) and regular-exit rows from
-    N(0.2, 0.5), so a Mann-Whitney comparison should yield a positive
-    rank-biserial effect size. Remaining schema columns are zero fillers.
-    """
-    rng = np.random.default_rng(77)  # fixed seed: deterministic fixture
-    # force exits have larger rewards than regular exits
-    force_exit_rewards = rng.normal(1.0, 0.5, size=n)
-    regular_exit_rewards = rng.normal(0.2, 0.5, size=n)
-
-    df_force = pd.DataFrame(
-        {
-            "is_force_exit": np.ones(n),
-            "reward_exit": force_exit_rewards,
-        }
-    )
-    df_regular = pd.DataFrame(
-        {
-            "is_force_exit": np.zeros(n),
-            "reward_exit": regular_exit_rewards,
-        }
-    )
-    df = pd.concat([df_force, df_regular], ignore_index=True)
-    # fillers
-    df["reward_total"] = df["reward_exit"]
-    df["reward_idle"] = 0.0
-    df["position"] = 0.0
-    df["pnl"] = 0.0
-    df["trade_duration"] = 0.0
-    df["idle_duration"] = 0.0
-    return df
-
-
-def test_distribution_shift_metrics():
-    """Check shift metrics are near-null for two samples of the same law.
-
-    Draws two independent N(0, 1) samples and asserts non-negative KL/JS,
-    small JS distance, and a non-rejecting KS test for every feature.
-    """
-    # Identical distributions should yield near-zero KL/JS and high KS p-value
-    rng = np.random.default_rng(2025)  # fixed seed: deterministic comparison
-    X = rng.normal(0, 1, size=5000)
-    Y = rng.normal(0, 1, size=5000)
-
-    # Same sample reused per column: the metric is computed column-wise.
-    s_df = pd.DataFrame({"pnl": X, "trade_duration": X, "idle_duration": X})
-    r_df = pd.DataFrame({"pnl": Y, "trade_duration": Y, "idle_duration": Y})
-
-    metrics = compute_distribution_shift_metrics(s_df, r_df)
-
-    for feat in ["pnl", "trade_duration", "idle_duration"]:
-        # KL divergence and JS distance are non-negative by definition.
-        assert metrics[f"{feat}_kl_divergence"] >= 0
-        assert metrics[f"{feat}_js_distance"] >= 0
-        # should be small for identical distributions (allowing small numerical noise)
-        assert metrics[f"{feat}_js_distance"] < 0.1
-        # KS should not reject (p > 0.05)
-        assert metrics[f"{feat}_ks_pvalue"] > 0.05
-
-
-def test_hypothesis_tests():
-    """Validate the three hypothesis tests against purpose-built fixtures.
-
-    Each fixture encodes a known effect (see its builder), so the
-    corresponding test must come back significant with the expected sign
-    and a bounded effect size.
-    """
-    # Spearman negative correlation should be detected
-    df_s = make_df_for_spearman()
-    res_s = statistical_hypothesis_tests(df_s)
-    assert "idle_correlation" in res_s
-    assert res_s["idle_correlation"]["rho"] < 0
-    assert res_s["idle_correlation"]["p_value"] < 0.05
-
-    # Kruskal with 3 groups -> significant + epsilon^2 in [0, 1)
-    df_k = make_df_for_kruskal()
-    res_k = statistical_hypothesis_tests(df_k)
-    assert "position_reward_difference" in res_k
-    eps2 = res_k["position_reward_difference"]["effect_size_epsilon_sq"]
-    assert 0 <= eps2 < 1
-    assert res_k["position_reward_difference"]["p_value"] < 0.05
-
-    # Mann–Whitney rank-biserial should be positive since force > regular
-    df_m = make_df_for_mannwhitney()
-    res_m = statistical_hypothesis_tests(df_m)
-    assert "force_vs_regular_exits" in res_m
-    assert res_m["force_vs_regular_exits"]["effect_size_rank_biserial"] > 0
-    assert res_m["force_vs_regular_exits"]["p_value"] < 0.05
-
-
-def test_distribution_diagnostics():
-    """Check QQ R^2 from distribution diagnostics on Gaussian data.
-
-    For normally distributed samples the normal QQ plot is close to a
-    straight line, so the reported R^2 should be high (> 0.97 here).
-    """
-    rng = np.random.default_rng(99)  # fixed seed: deterministic fixture
-    data = rng.normal(0, 1, size=2000)
-    # Same sample reused per column: diagnostics are computed column-wise.
-    df = pd.DataFrame(
-        {
-            "reward_total": data,
-            "pnl": data,
-            "trade_duration": data,
-            "idle_duration": data,
-        }
-    )
-    diag = distribution_diagnostics(df)
-    # QQ R^2 should be high for normal data
-    assert diag["reward_total_qq_r_squared"] > 0.97
-
-
-if __name__ == "__main__":
-    # Run tests sequentially; the first failing assert aborts the run with
-    # an AssertionError, so reaching the final print means all checks passed.
-    test_distribution_shift_metrics()
-    test_hypothesis_tests()
-    test_distribution_diagnostics()
-    print("All statistical coherence checks passed.")