next_potential: float = 0.0
# PBRS helpers
base_reward: float = 0.0
- pbrs_delta: float = 0.0 # Δ(s,s') = γ·Φ(s') − Φ(s)
+ pbrs_delta: float = 0.0 # Δ(s,a,s') = γ·Φ(s') − Φ(s)
invariance_correction: float = 0.0
)
base_reward = breakdown.invalid_penalty
- factor = _get_float_param(params, "base_factor", base_factor)
+ base_factor = _get_float_param(params, "base_factor", base_factor)
if "profit_aim" in params:
profit_aim = _get_float_param(params, "profit_aim", float(profit_aim))
pnl_target = float(profit_aim * risk_reward_ratio)
- idle_factor = factor * (profit_aim / 4.0)
+ idle_factor = base_factor * (profit_aim / risk_reward_ratio) / 4.0
hold_factor = idle_factor
max_trade_duration_candles = _get_int_param(
) or (context.action == Actions.Short_exit and context.position == Positions.Short)
if is_exit_action:
base_reward = _compute_exit_reward(
- factor,
+ base_factor,
pnl_target,
current_duration_ratio,
context,
breakdown.total = base_reward
return breakdown
- total_reward, reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive = (
- apply_potential_shaping(
- base_reward=base_reward,
+ reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive = (
+ compute_pbrs_components(
current_pnl=current_pnl,
pnl_target=pnl_target,
current_duration_ratio=current_duration_ratio,
breakdown.exit_additive = exit_additive
breakdown.pbrs_delta = pbrs_delta
breakdown.invariance_correction = reward_shaping - pbrs_delta
- breakdown.total = total_reward
+ breakdown.total = base_reward + reward_shaping + entry_additive + exit_additive
return breakdown
prev_potential = float(prev_potential) if np.isfinite(prev_potential) else 0.0
return float(next_potential)
-def apply_potential_shaping(
- base_reward: float,
+def compute_pbrs_components(
current_pnl: float,
pnl_target: float,
current_duration_ratio: float,
is_exit: bool = False,
is_entry: bool = False,
prev_potential: float,
-) -> tuple[float, float, float, float, float, float]:
- """Compute shaped reward using PBRS.
+) -> tuple[float, float, float, float, float]:
+ """Compute potential-based reward shaping (PBRS) components.
+
+ This function computes the PBRS shaping terms without combining them with the base reward,
+ allowing the caller to construct the total reward as R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + additives.
+
+ Canonical PBRS Formula
+ ----------------------
+ R'(s,a,s') = R(s,a,s') + γ·Φ(s') - Φ(s)
+
+ where:
+ Δ(s,a,s') = γ·Φ(s') - Φ(s) (PBRS shaping term)
+
+ Notation
+ --------
+ **States & Actions:**
+ s : current state
+ s' : next state
+ a : action
+
+ **Reward Components:**
+ R(s,a,s') : base reward
+ R'(s,a,s') : shaped reward
+ Δ(s,a,s') : PBRS shaping term = γ·Φ(s') - Φ(s)
+
+ **Potential Function:**
+ Φ(s) : potential at state s
+ γ : discount factor for shaping (gamma)
+
+ **State Variables:**
+ r_pnl : pnl / pnl_target (PnL ratio)
+        r_dur : duration / max_duration (duration ratio, clamped to [0,1])
+ g : gain parameter
+ T_x : transform function (tanh, softsign, etc.)
+
+ **Potential Formula:**
+ Φ(s) = scale · 0.5 · [T_pnl(g·r_pnl) + sgn(r_pnl)·T_dur(g·r_dur)]
+
+ PBRS Theory & Compliance
+ ------------------------
+ - Ng et al. 1999: potential-based shaping preserves optimal policy
+ - Wiewiora et al. 2003: terminal states must have Φ(terminal) = 0
+ - Invariance holds ONLY in canonical mode with additives disabled
+ - Theorem: Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0 over episodes
+
+ Architecture & Transitions
+ --------------------------
+ **Three mutually exclusive transition types:**
+
+ 1. **Entry** (Neutral → Long/Short):
+ - Φ(s) = 0 (neutral state has no potential)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - 0 = γ·Φ(s')
+ - Optional entry additive (breaks invariance)
+
+ 2. **Hold** (Long/Short → Long/Short):
+ - Φ(s) = hold_potential(s)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - Φ(s)
+ - Φ(s') reflects updated PnL and duration
+
+ 3. **Exit** (Long/Short → Neutral):
+ - Φ(s) = hold_potential(s)
+ - Φ(s') depends on exit_potential_mode:
+ * **canonical**: Φ(s') = 0 → Δ = -Φ(s)
+ * **heuristic**: Φ(s') = f(Φ(s)) → Δ = γ·Φ(s') - Φ(s)
+ - Optional exit additive (breaks invariance)
+
+ Exit Potential Modes
+ --------------------
+ **canonical** (PBRS-compliant):
+ Φ(s') = 0
+ Δ = γ·0 - Φ(s) = -Φ(s)
+ Additives disabled automatically
+
+ **non_canonical**:
+ Φ(s') = 0
+ Δ = -Φ(s)
+ Additives allowed (breaks invariance)
+
+ **progressive_release** (heuristic):
+ Φ(s') = Φ(s)·(1 - d) where d = decay_factor
+ Δ = γ·Φ(s)·(1-d) - Φ(s)
+
+ **spike_cancel** (heuristic):
+ Φ(s') = Φ(s)/γ
+ Δ = γ·(Φ(s)/γ) - Φ(s) = 0
+
+ **retain_previous** (heuristic):
+ Φ(s') = Φ(s)
+ Δ = γ·Φ(s) - Φ(s) = (γ-1)·Φ(s)
+
+ Additive Terms (Non-PBRS)
+ --------------------------
+ Entry and exit additives are **optional bonuses** that break PBRS invariance:
+ - Entry additive: applied on Neutral→Long/Short transitions
+ - Exit additive: applied on Long/Short→Neutral transitions
+ - These do NOT persist in Φ(s) storage
+
+ Invariance & Validation
+ -----------------------
+ **Theoretical Guarantee:**
+ Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0
+ (Φ(start) = Φ(end) = 0)
+
+ **Deviations from Theory:**
+ - Heuristic exit modes violate invariance
+ - Entry/exit additives break policy invariance
+ - Non-canonical modes introduce path dependence
+
+ **Robustness:**
+ - All transforms bounded: |T_x| ≤ 1
+ - Validation: |Φ(s)| ≤ scale
+ - Bounds: |Δ(s,a,s')| ≤ (1+γ)·scale
+ - Terminal enforcement: Φ(s) = 0 when terminated
+
+ Implementation Details
+ ----------------------
+ This is a stateless pure function for analysis and testing:
+ - All state (Φ(s), γ, configuration) passed explicitly as parameters
+ - Returns diagnostic values (next_potential, pbrs_delta) for inspection
+ - Does not mutate any inputs
+ - Suitable for batch processing and unit testing
+
+ For production RL environment use, see ReforceXY._compute_pbrs_components()
+ which wraps this logic with stateful management (self._last_potential, etc.)
+
+ Parameters
+ ----------
+ current_pnl : float
+ Current state s PnL
+ pnl_target : float
+ Target PnL for ratio normalization: r_pnl = pnl / pnl_target
+ current_duration_ratio : float
+ Current state s duration ratio [0,1]: r_dur = duration / max_duration
+ next_pnl : float
+ Next state s' PnL (after action)
+ next_duration_ratio : float
+ Next state s' duration ratio [0,1]
+ params : RewardParams
+ Configuration dictionary with keys:
+ - potential_gamma: γ (shaping discount factor)
+ - exit_potential_mode: "canonical" | "non_canonical" | heuristic modes
+ - hold_potential_enabled: enable/disable hold potential computation
+ - entry_additive_enabled, exit_additive_enabled: enable non-PBRS additives
+ - hold_potential_scale, hold_potential_gain, transforms, etc.
+ is_exit : bool, optional
+ True if this is an exit transition (Long/Short → Neutral)
+ is_entry : bool, optional
+ True if this is an entry transition (Neutral → Long/Short)
+ prev_potential : float
+ Φ(s) - potential at current state s (must be passed explicitly)
Returns
-------
- tuple[float, float, float, float, float, float]
- (reward, reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive)
- where pbrs_delta = gamma * next_potential - prev_potential is the pure PBRS component.
+ tuple[float, float, float, float, float]
+ (reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive)
+
+ - reward_shaping: Δ(s,a,s') = γ·Φ(s') - Φ(s), the PBRS shaping term
+ - next_potential: Φ(s') for next step (caller must store this)
+ - pbrs_delta: same as reward_shaping (diagnostic/compatibility)
+ - entry_additive: optional non-PBRS entry bonus (0.0 if disabled or not entry)
+ - exit_additive: optional non-PBRS exit bonus (0.0 if disabled or not exit)
Notes
-----
- - Shaping Δ = γ·Φ(next) − Φ(prev).
- - Φ(prev) must be provided explicitly as the stored potential carried across steps.
-      This uses an explicit stored-potential value across steps.
- - Exit potential modes compute Φ(next) from Φ(prev).
- - Entry additive is applied only on entry transitions (based on next_* metrics).
- - Exit additive is applied only on exit transitions (based on current_* metrics).
-
- Note
- ----------------------
- Canonical mode is typically evaluated with additives disabled externally.
- This helper intentionally does not mutate `params`.
+ **State Management:**
+ - Caller is responsible for storing Φ(s') (returned as next_potential)
+ - No internal state; pure function
+
+ **Configuration:**
+ - All parameters read from params dict
+ - Use DEFAULT_MODEL_REWARD_PARAMETERS for defaults
+
+ **Recommendations:**
+ - Use canonical mode for policy-invariant shaping
+ - Monitor Σ_t γ^t·Δ_t ≈ 0 per episode in canonical mode
+ - Disable additives to preserve theoretical PBRS guarantees
+
+ **Validation:**
+ - Returns (0,0,0,0,0) if any output is non-finite
+ - Transform bounds ensure |Φ| ≤ scale
+
+ See Also
+ --------
+ ReforceXY._compute_pbrs_components : Stateful wrapper for RL environment
+ apply_potential_shaping : Deprecated wrapper that adds base_reward
"""
gamma = _get_potential_gamma(params)
if is_exit:
next_potential = _compute_exit_potential(prev_potential, params)
- # PBRS shaping Δ = γ·Φ(next) − Φ(prev)
pbrs_delta = gamma * next_potential - prev_potential
reward_shaping = pbrs_delta
else:
- # When hold potential is disabled, force Φ(next)=0 and emit no PBRS shaping on entry/hold.
if not hold_potential_enabled:
next_potential = 0.0
pbrs_delta = 0.0
next_potential = _compute_hold_potential(
next_pnl, pnl_target, next_duration_ratio, params
)
- # PBRS shaping Δ = γ·Φ(next) − Φ(prev)
pbrs_delta = gamma * next_potential - prev_potential
reward_shaping = pbrs_delta
- # Non-PBRS additives
if canonical_mode:
entry_additive = 0.0
exit_additive = 0.0
cand_exit_add = _compute_exit_additive(
current_pnl, pnl_target, current_duration_ratio, params
)
-
entry_additive = cand_entry_add if is_entry else 0.0
exit_additive = cand_exit_add if is_exit else 0.0
- reward = base_reward + reward_shaping + entry_additive + exit_additive
+ if not (
+ np.isfinite(reward_shaping)
+ and np.isfinite(next_potential)
+ and np.isfinite(pbrs_delta)
+ and np.isfinite(entry_additive)
+ and np.isfinite(exit_additive)
+ ):
+ return 0.0, 0.0, 0.0, 0.0, 0.0
+
+ return (
+ float(reward_shaping),
+ float(next_potential),
+ float(pbrs_delta),
+ float(entry_additive),
+ float(exit_additive),
+ )
+
+
+def apply_potential_shaping(
+ base_reward: float,
+ current_pnl: float,
+ pnl_target: float,
+ current_duration_ratio: float,
+ next_pnl: float,
+ next_duration_ratio: float,
+ params: RewardParams,
+ *,
+ is_exit: bool = False,
+ is_entry: bool = False,
+ prev_potential: float,
+) -> tuple[float, float, float, float, float, float]:
+ """Compute shaped reward and PBRS diagnostics.
+
+ .. deprecated::
+ This function exists only for backward compatibility with existing tests.
+ New code should use :func:`compute_pbrs_components` and compute the total reward manually.
+
+ This is a thin wrapper around `compute_pbrs_components()` that adds PBRS and
+ optional additive terms to the provided `base_reward`.
+
+ Returns
+ -------
+ tuple[float, float, float, float, float, float]
+ (reward, reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive)
+ """
+
+ reward_shaping, next_potential, pbrs_delta, entry_additive, exit_additive = (
+ compute_pbrs_components(
+ current_pnl,
+ pnl_target,
+ current_duration_ratio,
+ next_pnl,
+ next_duration_ratio,
+ params,
+ is_exit=is_exit,
+ is_entry=is_entry,
+ prev_potential=prev_potential,
+ )
+ )
+
+ reward = float(base_reward) + reward_shaping + entry_additive + exit_additive
if not np.isfinite(reward):
return float(base_reward), 0.0, 0.0, 0.0, 0.0, 0.0
) -> float:
"""Compute PBRS potential Φ(s) for position holding states.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
return self._compute_pnl_duration_signal(
enabled=self._hold_potential_enabled,
) -> float:
"""Compute exit additive reward for position exit transitions.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
return self._compute_pnl_duration_signal(
enabled=self._exit_additive_enabled,
) -> float:
"""Compute entry additive reward for position entry transitions.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
return self._compute_pnl_duration_signal(
enabled=self._entry_additive_enabled,
def _compute_exit_potential(self, prev_potential: float, gamma: float) -> float:
"""Compute next potential Φ(s') for exit transitions based on exit potential mode.
- See ``_apply_potential_shaping`` for complete PBRS documentation.
+ See ``_compute_pbrs_components`` for PBRS documentation.
"""
mode = self._exit_potential_mode
# "canonical" or "non_canonical"
"""
return hold_potential_enabled and not add_state_info
- def _apply_potential_shaping(
+ def _compute_pbrs_components(
self,
- base_reward: float,
+ *,
action: int,
trade_duration: float,
max_trade_duration: float,
pnl: float,
pnl_target: float,
- ) -> float:
- """Apply potential-based reward shaping (PBRS) (Ng et al. 1999).
+ ) -> tuple[float, float, float]:
+ """Compute potential-based reward shaping (PBRS) components.
+
+ This method computes the PBRS shaping terms without combining them with the base reward,
+ allowing the caller to construct the total reward as R'(s,a,s') = R(s,a,s') + Δ(s,a,s') + additives.
+
+ Canonical PBRS Formula
+ ----------------------
+ R'(s,a,s') = R(s,a,s') + γ·Φ(s') - Φ(s)
- Canonical formula: R'(s,a,s') = R_base(s,a,s') + γ Φ(s') − Φ(s)
+ where:
+ Δ(s,a,s') = γ·Φ(s') - Φ(s) (PBRS shaping term)
Notation
--------
- R_base: base reward; Φ(s)/Φ(s'): potentials (prev/next); γ: shaping discount;
- Δ(s,s') = γΦ(s') − Φ(s); R' = R_base + Δ + optional additives; pnl_ratio = pnl/pnl_target;
- duration_ratio = trade_duration / max_trade_duration (clamped to [0,1]).
+ **States & Actions:**
+ s : current state
+ s' : next state
+ a : action
+
+ **Reward Components:**
+ R(s,a,s') : base reward
+ R'(s,a,s') : shaped reward
+ Δ(s,a,s') : PBRS shaping term = γ·Φ(s') - Φ(s)
+
+ **Potential Function:**
+ Φ(s) : potential at state s
+ γ : discount factor for shaping (gamma)
+
+ **State Variables:**
+ r_pnl : pnl / pnl_target (PnL ratio)
+            r_dur : duration / max_duration (duration ratio, clamped to [0,1])
+ g : gain parameter
+ T_x : transform function (tanh, softsign, etc.)
+
+ **Potential Formula:**
+ Φ(s) = scale · 0.5 · [T_pnl(g·r_pnl) + sgn(r_pnl)·T_dur(g·r_dur)]
PBRS Theory & Compliance
------------------------
- - Ng et al. 1999 (potential-based shaping invariance)
- - Wiewiora et al. 2003 (Φ(terminal)=0 handling)
- - Invariance holds only in canonical mode with additives disabled.
+ - Ng et al. 1999: potential-based shaping preserves optimal policy
+ - Wiewiora et al. 2003: terminal states must have Φ(terminal) = 0
+ - Invariance holds ONLY in canonical mode with additives disabled
+ - Theorem: Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0 over episodes
Architecture & Transitions
--------------------------
- Three mutually exclusive transition types:
+ **Three mutually exclusive transition types:**
1. **Entry** (Neutral → Long/Short):
- - Initialize potential Φ for next step: Φ(s') = hold_potential(next_state)
- - PBRS shaping reward: γΦ(s') - Φ(s) where Φ(s)=0 (neutral has no potential)
- - Optional entry additive (non-PBRS additive term, breaks invariance if used)
+ - Φ(s) = 0 (neutral state has no potential)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - 0 = γ·Φ(s')
+ - Optional entry additive (breaks invariance)
2. **Hold** (Long/Short → Long/Short):
- - Standard PBRS: γΦ(s') - Φ(s) where both potentials computed from hold_potential()
- - Φ(s') accounts for updated PnL and trade duration progression
+ - Φ(s) = hold_potential(s)
+ - Φ(s') = hold_potential(s')
+ - Δ(s,a,s') = γ·Φ(s') - Φ(s)
+ - Φ(s') reflects updated PnL and duration
3. **Exit** (Long/Short → Neutral):
- - **Canonical mode**: Φ(terminal)=0, Δ(s,s') = -Φ(s)
- - **Heuristic modes**: Φ(s') computed by _compute_exit_potential(), Δ(s,s') = γΦ(s')-Φ(s)
- - Optional exit additive (non-PBRS additive term for trade quality summary)
-
- Potential Function Φ(s)
- -----------------------
- Φ(s) = scale * 0.5 * [T_pnl(g * pnl_ratio) + sign(pnl_ratio) * T_dur(g * duration_ratio)]
- Transforms (bounded in [-1,1]): tanh, softsign, arctan, sigmoid (≈ tanh(0.5x)), asinh, clip.
- Parameters: gain g (sharpens/softens), scale.
+ - Φ(s) = hold_potential(s)
+ - Φ(s') depends on exit_potential_mode:
+ * **canonical**: Φ(s') = 0 → Δ = -Φ(s)
+ * **heuristic**: Φ(s') = f(Φ(s)) → Δ = γ·Φ(s') - Φ(s)
+ - Optional exit additive (breaks invariance)
Exit Potential Modes
--------------------
**canonical** (PBRS-compliant):
- - Φ(s')=0 for all exit transitions
- - Maintains theoretical invariance guarantees
- - Shaping reward: γ·0-Φ(s) = -Φ(s)
- - Entry/exit additives automatically disabled to preserve invariance
+ Φ(s') = 0
+ Δ = γ·0 - Φ(s) = -Φ(s)
+ Additives disabled automatically
**non_canonical**:
- - Φ(s')=0 for all exit transitions
- - Entry/exit additives are allowed
+ Φ(s') = 0
+ Δ = -Φ(s)
+ Additives allowed (breaks invariance)
**progressive_release** (heuristic):
- - Φ(s')=Φ(s)*(1-decay_factor), gradual decay
- - Shaping reward: γΦ(s')-Φ(s) = γΦ(s)*(1-d)-Φ(s)
+ Φ(s') = Φ(s)·(1 - d) where d = decay_factor
+ Δ = γ·Φ(s)·(1-d) - Φ(s)
**spike_cancel** (heuristic):
- - Φ(s')=Φ(s)/γ (γ>0 finite)
- - Shaping reward: γΦ(s')-Φ(s) = γ*(Φ(s)/γ)-Φ(s) = 0
+ Φ(s') = Φ(s)/γ
+ Δ = γ·(Φ(s)/γ) - Φ(s) = 0
**retain_previous** (heuristic):
- - Φ(s')=Φ(s), full retention
- - Shaping reward: (γ-1)Φ(s)
-
- Additive Components & Path Dependence
- ------------------------------------
- **Entry/Exit Additive Terms**: Non-PBRS additive rewards that break invariance
- - Entry additive: Applied at entry transitions, computed via _compute_entry_additive()
- - Exit additive: Applied at exit transitions, computed via _compute_exit_additive()
- - Neither additive persists in stored potential (maintains neutrality)
+ Φ(s') = Φ(s)
+ Δ = γ·Φ(s) - Φ(s) = (γ-1)·Φ(s)
- **Path Dependence**: Only canonical preserves invariance; others introduce path dependence.
+ Additive Terms (Non-PBRS)
+ --------------------------
+ Entry and exit additives are **optional bonuses** that break PBRS invariance:
+ - Entry additive: applied on Neutral→Long/Short transitions
+ - Exit additive: applied on Long/Short→Neutral transitions
+ - These do NOT persist in Φ(s) storage
Invariance & Validation
-----------------------
- **Theoretical Guarantee**: Canonical + no additives ⇒ Σ_t γ^t Δ_t = 0 (Φ(start)=Φ(end)=0).
-
- **Deviations from Theory**:
- - Heuristic exit modes violate invariance
- - Entry/exit additives break policy invariance
- - Non-canonical modes may cause path-dependent learning
-
- **Robustness**:
- - Bounded transforms prevent potential explosion
- - Finite value validation with fallback to 0
- - Terminal state enforcement: Φ(s)=0 when terminated=True
- - All transform functions are strictly bounded in [-1, 1], ensuring numerical stability
- - Bounds: |Φ(s)| ≤ scale ; |Δ(s,s')| ≤ (1+γ)*scale
+ **Theoretical Guarantee:**
+ Canonical + no additives ⇒ Σ_t γ^t·Δ_t = 0
+ (Φ(start) = Φ(end) = 0)
+
+ **Deviations from Theory:**
+ - Heuristic exit modes violate invariance
+ - Entry/exit additives break policy invariance
+ - Non-canonical modes introduce path dependence
+
+ **Robustness:**
+ - All transforms bounded: |T_x| ≤ 1
+ - Validation: |Φ(s)| ≤ scale
+ - Bounds: |Δ(s,a,s')| ≤ (1+γ)·scale
+ - Terminal enforcement: Φ(s) = 0 when terminated
+
+ Implementation Details
+ ----------------------
+ This method wraps the core PBRS logic for use in the RL environment:
+ - Reads Φ(s) from self._last_potential (previous state potential)
+ - Reads γ from self._potential_gamma
+ - Reads configuration from self._exit_potential_mode, self._entry_additive_enabled, etc.
+ - Computes next_position, next_duration_ratio, is_entry, is_exit internally
+ - Stores Φ(s') to self._last_potential for next step
+ - Updates diagnostic accumulators (_total_reward_shaping, _total_entry_additive, etc.)
Parameters
----------
- base_reward : float
- Original reward before shaping
action : int
- Action taken leading to transition
+ Action taken: determines transition type (entry/hold/exit)
trade_duration : float
- Current trade duration in candles
+ Current trade duration in candles (for current state s)
max_trade_duration : float
- Maximum allowed trade duration
+ Maximum allowed trade duration (for normalization)
pnl : float
- Current position PnL
+ Current position PnL (for current state s)
pnl_target : float
- Target PnL for normalization
+ Target PnL for ratio normalization: r_pnl = pnl / pnl_target
Returns
-------
- float
- Shaped reward R'(s,a,s') = R_base + Δ(s,s') + optional_additives
+ tuple[float, float, float]
+ (reward_shaping, entry_additive, exit_additive)
+
+ - reward_shaping: Δ(s,a,s') = γ·Φ(s') - Φ(s), the PBRS shaping term
+ - entry_additive: optional non-PBRS entry bonus (0.0 if disabled or not entry)
+ - exit_additive: optional non-PBRS exit bonus (0.0 if disabled or not exit)
Notes
-----
- - Canonical mode recommended for invariance
- - Monitor discounted Σ γ^t Δ_t (≈0 per episode canonical)
- - Heuristic exit modes may affect convergence
- - Transform validation delegated to analysis tools
- - Φ reset at exits (canonical) enables telescoping cancellation
+ **State Management:**
+ - Current Φ(s): read from self._last_potential
+ - Next Φ(s'): computed and stored to self._last_potential
+ - Transition type: inferred from self._position and action
+
+ **Configuration Sources:**
+ - γ: self._potential_gamma
+ - Exit mode: self._exit_potential_mode
+ - Additives: self._entry_additive_enabled, self._exit_additive_enabled
+ - Transforms: self._hold_potential_transform_pnl, etc.
+
+ **Recommendations:**
+ - Use canonical mode for policy-invariant shaping
+ - Monitor Σ_t γ^t·Δ_t ≈ 0 per episode in canonical mode
+ - Disable additives to preserve theoretical PBRS guarantees
+
+ See Also
+ --------
+ reward_space_analysis.compute_pbrs_components : Stateless version for analysis
"""
+ prev_potential = float(self._last_potential)
+
if not self._hold_potential_enabled and not (
self._entry_additive_enabled or self._exit_additive_enabled
):
- return base_reward
- prev_potential = self._last_potential
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(prev_potential)
+ self._last_entry_additive = 0.0
+ self._last_exit_additive = 0.0
+ self._last_reward_shaping = 0.0
+ return 0.0, 0.0, 0.0
+
next_position, next_trade_duration, next_pnl = self._get_next_transition_state(
action=action, trade_duration=trade_duration, pnl=pnl
)
) and next_position in (Positions.Long, Positions.Short)
gamma = self._potential_gamma
- if is_entry:
+
+ reward_shaping = 0.0
+ entry_additive = 0.0
+ exit_additive = 0.0
+ next_potential = prev_potential
+
+ if is_entry or is_hold:
if self._hold_potential_enabled:
- potential = self._compute_hold_potential(
+ next_potential = self._compute_hold_potential(
next_position, next_duration_ratio, next_pnl, pnl_target
)
- reward_shaping = gamma * potential - prev_potential
- self._last_potential = potential
+ reward_shaping = gamma * next_potential - prev_potential
else:
+ next_potential = 0.0
reward_shaping = 0.0
- self._last_potential = 0.0
- self._last_exit_additive = 0.0
- self._last_entry_additive = 0.0
- entry_additive = 0.0
- if self._entry_additive_enabled and not self.is_pbrs_invariant_mode():
+
+ if (
+ is_entry
+ and self._entry_additive_enabled
+ and not self.is_pbrs_invariant_mode()
+ ):
entry_additive = self._compute_entry_additive(
pnl=next_pnl,
pnl_target=pnl_target,
duration_ratio=next_duration_ratio,
)
- self._last_entry_additive = float(entry_additive)
self._total_entry_additive += float(entry_additive)
- self._last_reward_shaping = float(reward_shaping)
- self._total_reward_shaping += float(reward_shaping)
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- return base_reward + reward_shaping + entry_additive
- elif is_hold:
- if self._hold_potential_enabled:
- potential = self._compute_hold_potential(
- next_position, next_duration_ratio, next_pnl, pnl_target
- )
- reward_shaping = gamma * potential - prev_potential
- self._last_potential = potential
- else:
- reward_shaping = 0.0
- self._last_potential = 0.0
- self._last_entry_additive = 0.0
- self._last_exit_additive = 0.0
- self._last_reward_shaping = float(reward_shaping)
- self._total_reward_shaping += float(reward_shaping)
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- return base_reward + reward_shaping
+
elif is_exit:
if (
self._exit_potential_mode
== ReforceXY._EXIT_POTENTIAL_MODES[1] # "non_canonical"
):
next_potential = 0.0
- exit_reward_shaping = -prev_potential
+ reward_shaping = -prev_potential
else:
next_potential = self._compute_exit_potential(prev_potential, gamma)
- exit_reward_shaping = gamma * next_potential - prev_potential
- self._last_entry_additive = 0.0
- self._last_exit_additive = 0.0
- exit_additive = 0.0
+ reward_shaping = gamma * next_potential - prev_potential
+
if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
duration_ratio = trade_duration / max(max_trade_duration, 1)
exit_additive = self._compute_exit_additive(
pnl, pnl_target, duration_ratio
)
- self._last_exit_additive = float(exit_additive)
self._total_exit_additive += float(exit_additive)
- self._last_potential = next_potential
- self._last_reward_shaping = float(exit_reward_shaping)
- self._total_reward_shaping += float(exit_reward_shaping)
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- return base_reward + exit_reward_shaping + exit_additive
+
else:
# Neutral self-loop
- self._last_prev_potential = float(prev_potential)
- self._last_next_potential = float(self._last_potential)
- self._last_entry_additive = 0.0
- self._last_exit_additive = 0.0
- self._last_reward_shaping = 0.0
- return base_reward
+ next_potential = prev_potential
+ reward_shaping = 0.0
+
+ self._last_potential = float(next_potential)
+ self._last_prev_potential = float(prev_potential)
+ self._last_next_potential = float(self._last_potential)
+ self._last_entry_additive = float(entry_additive)
+ self._last_exit_additive = float(exit_additive)
+ self._last_reward_shaping = float(reward_shaping)
+ self._total_reward_shaping += float(reward_shaping)
+
+ return float(reward_shaping), float(entry_additive), float(exit_additive)
def _set_observation_space(self) -> None:
"""
3. Hold overtime penalty
4. Exit reward
5. Default fallback (0.0 if no specific reward)
- 6. PBRS application: R'(s,a,s') = R_base + Δ(s,s') + optional_additives
+ 6. PBRS computation and application: R'(s,a,s') = R_base + Δ(s,a,s') + optional_additives
The final shaped reward is what the RL agent receives for learning.
In canonical PBRS mode, the learned policy is theoretically equivalent
Returns
-------
float
- Shaped reward R'(s,a,s') = R_base + Δ(s,s') + optional_additives
+ Shaped reward R'(s,a,s') = R_base + Δ(s,a,s') + optional_additives
"""
model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
base_reward: Optional[float] = None
base_factor = float(
model_reward_parameters.get("base_factor", ReforceXY.DEFAULT_BASE_FACTOR)
)
- idle_factor = base_factor * (self.profit_aim / 4.0)
+ idle_factor = base_factor * (self.profit_aim / self.rr) / 4.0
hold_factor = idle_factor
# 2. Idle penalty
base_reward = 0.0
# 6. Potential-based reward shaping
- return self._apply_potential_shaping(
- base_reward=base_reward,
+ reward_shaping, entry_additive, exit_additive = self._compute_pbrs_components(
action=action,
trade_duration=trade_duration,
max_trade_duration=max_trade_duration,
pnl_target=self._pnl_target,
)
+ return base_reward + reward_shaping + entry_additive + exit_additive
+
def _get_observation(self) -> NDArray[np.float32]:
start_idx = max(self._start_tick, self._current_tick - self.window_size)
end_idx = min(self._current_tick, len(self.signal_features))