From: Jérôme Benoit
Date: Tue, 23 Sep 2025 13:52:16 +0000 (+0200)
Subject: perf(reforcexy): initial revamp of reward logic
X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=fb2dc038ffb37d3d6991a25535f01f42eeeff361;p=freqai-strategies.git

perf(reforcexy): initial revamp of reward logic

* more continuous reward space for gradient friendliness
* HH and LL based reward when position is open

Signed-off-by: Jérôme Benoit
---

diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 3ead882..026fe0d 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -2,6 +2,7 @@ import copy
 import gc
 import json
 import logging
+import math
 import time
 import warnings
 from collections import defaultdict
@@ -1121,6 +1122,8 @@ class MyRLEnv(Base5ActionRLEnv):
         self.timeout: int = self.rl_config.get("max_trade_duration_candles", 128)
         self._last_closed_position: Optional[Positions] = None
         self._last_closed_trade_tick: int = 0
+        self._max_unrealized_profit: float = -np.inf
+        self._min_unrealized_profit: float = np.inf
         if self.force_actions:
             logger.info(
                 "%s - take_profit: %s, stop_loss: %s, timeout: %s candles (%s days), observation_space: %s",
@@ -1187,18 +1190,50 @@ class MyRLEnv(Base5ActionRLEnv):
         """
         Compute the reward factor at trade exit
         """
-        if trade_duration <= max_trade_duration:
-            factor *= 1.5
-        elif trade_duration > max_trade_duration:
-            factor *= 0.5
+        model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
+        exit_factor_mode = model_reward_parameters.get("exit_factor_mode", "linear")
+        duration_ratio = trade_duration / max_trade_duration
+
+        if exit_factor_mode == "legacy":
+            if trade_duration <= max_trade_duration:
+                factor *= 1.5
+            else:
+                factor *= 0.5
+        elif exit_factor_mode == "sqrt":
+            factor /= math.sqrt(1.0 + duration_ratio)
+        elif exit_factor_mode == "linear":
+            factor /= 1.0 + duration_ratio
+        elif exit_factor_mode == "hybrid":
+            if duration_ratio <= 1.0:
+                factor *= 1.2 - 0.2 * duration_ratio
+            else:
+                factor /= math.sqrt(1.0 + (duration_ratio - 1.0))
+
         if pnl > self.profit_aim * self.rr:
-            factor *= float(
-                self.rl_config.get("model_reward_parameters", {}).get(
-                    "win_reward_factor", 2.0
-                )
-            )
+            factor *= float(model_reward_parameters.get("win_reward_factor", 2.0))
+
+        eff_weight = float(model_reward_parameters.get("exit_efficiency_weight", 0.0))
+        eff_center = float(model_reward_parameters.get("exit_efficiency_center", 0.5))
+        if eff_weight != 0.0:
+            max_pnl = self.get_max_unrealized_profit()
+            min_pnl = self.get_min_unrealized_profit()
+            range_pnl = max_pnl - min_pnl
+            if not np.isclose(range_pnl, 0.0):
+                eff_ratio = (pnl - min_pnl) / range_pnl
+                factor *= 1.0 + eff_weight * (eff_ratio - eff_center)
+
+        factor = max(0.0, factor)
+
         return factor
 
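+    # Worked numbers for _get_exit_factor (illustrative): with factor=100.0 and
+    # duration_ratio=0.5, "linear" yields 100/1.5 ~= 66.7, "sqrt" yields
+    # 100/sqrt(1.5) ~= 81.6, "hybrid" yields 100*(1.2 - 0.1) = 110.0 and
+    # "legacy" yields 150.0; all modes then share the win bonus, the efficiency
+    # scaling and the final clamp to factor >= 0.0.
+    # The helper below returns duration_ratio unchanged while the trade is
+    # within max_trade_duration, then blends it into a Gaussian decay once
+    # overdue, e.g. _progressive_duration_factor(1.5) ~= 0.39.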
+    def _progressive_duration_factor(self, duration_ratio: float) -> float:
+        if not isinstance(duration_ratio, (int, float)) or duration_ratio <= 0.0:
+            return 0.0
+        duration_overage_ratio = max(0.0, duration_ratio - 1.0)
+        dynamic_k = 2.5 + 3.0 * duration_overage_ratio
+        gaussian_decay = math.exp(-dynamic_k * duration_overage_ratio**2.0)
+        blend_weight = 1.0 - math.exp(-8.0 * duration_overage_ratio)
+        return (1.0 - blend_weight) * duration_ratio + blend_weight * gaussian_decay
+
     def calculate_reward(self, action: int) -> float:
         """
         An example reward function. This is the one function that users will likely
@@ -1229,6 +1264,8 @@ class MyRLEnv(Base5ActionRLEnv):
         trade_duration = self.get_trade_duration()
 
         factor = 100.0
+        holding_factor = factor * (self.profit_aim * self.rr) / 3.0
+        idle_factor = holding_factor
 
         # Force exits
         if self._force_action in (
@@ -1260,43 +1297,52 @@ class MyRLEnv(Base5ActionRLEnv):
         #     factor = 1
         # return 25.0 * factor
 
-        # discourage agent from sitting idle too long
+        # discourage agent from sitting idle
         if action == Actions.Neutral.value and self._position == Positions.Neutral:
-            return -0.01 * self.get_idle_duration() ** 1.05
+            return (
+                -idle_factor * (self.get_idle_duration() / max_trade_duration) ** 1.05
+            )
 
-        # pnl and duration aware agent reward while sitting in position
+        # discourage agent from sitting in position
         if (
             self._position in (Positions.Short, Positions.Long)
             and action == Actions.Neutral.value
         ):
-            duration_fraction = trade_duration / max_trade_duration
-            max_pnl = max(self.get_most_recent_max_pnl(), pnl)
-
-            if max_pnl > 0:
-                drawdown_penalty = 0.0025 * factor * (max_pnl - pnl) * duration_fraction
-            else:
-                drawdown_penalty = 0.0
-
-            lambda1 = 0.05
-            lambda2 = 0.1
-            if pnl >= 0:
-                if duration_fraction <= 1.0:
-                    duration_penalty_factor = 1.0
-                else:
-                    duration_penalty_factor = 1.0 / (
-                        1.0 + lambda1 * (duration_fraction - 1.0)
-                    )
-                return (
-                    factor * pnl * duration_penalty_factor
-                    - lambda2 * duration_fraction
-                    - drawdown_penalty
-                )
-            else:
-                return (
-                    factor * pnl * (1.0 + lambda1 * duration_fraction)
-                    - 2.0 * lambda2 * duration_fraction
-                    - drawdown_penalty
-                )
+            duration_ratio = trade_duration / max_trade_duration
+
+            max_pnl = self.get_max_unrealized_profit()
+            min_pnl = self.get_min_unrealized_profit()
+            range_pnl = max_pnl - min_pnl
+            if np.isclose(range_pnl, 0.0):
+                return -holding_factor * duration_ratio
+
+            pnl_from_max = pnl - max_pnl
+            pnl_from_min = pnl - min_pnl
+
+            threshold_ratio = 0.3
+
+            ratio_up = max(0.0, pnl_from_max) / range_pnl
+            ratio_down = max(0.0, -pnl_from_min) / range_pnl
+
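+            # Worked numbers (illustrative): with max_pnl=0.04, min_pnl=-0.01
+            # (range_pnl=0.05) and pnl=0.065, ratio_up=0.5 > threshold_ratio,
+            # so holding through a fresh higher high is rewarded with
+            # scale_up=(0.5-0.3)/0.7 ~= 0.286 times the progressive duration
+            # factor; a pnl inside the band widened by threshold_ratio falls
+            # through to the decaying holding penalty at the end of this block.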
+            if ratio_down > threshold_ratio:
+                scale_down = (ratio_down - threshold_ratio) / (1.0 - threshold_ratio)
+                return -holding_factor * duration_ratio * (1.0 + scale_down**1.25)
+
+            if ratio_up > threshold_ratio:
+                scale_up = (ratio_up - threshold_ratio) / (1.0 - threshold_ratio)
+                duration_factor = self._progressive_duration_factor(duration_ratio)
+                return holding_factor * duration_factor * scale_up**1.05
+
+            corr_min = min_pnl - threshold_ratio * range_pnl
+            corr_max = max_pnl + threshold_ratio * range_pnl
+            corr_range = corr_max - corr_min
+            if np.isclose(corr_range, 0.0):
+                return -holding_factor * duration_ratio
+            return (
+                -holding_factor
+                * duration_ratio
+                * (1.0 - (pnl - corr_min) / corr_range) ** 0.65
+            )
 
         # close long
         if action == Actions.Long_exit.value and self._position == Positions.Long:
@@ -1365,6 +1411,8 @@ class MyRLEnv(Base5ActionRLEnv):
     def _enter_trade(self, action: int) -> None:
         self._position = self._get_position(action)
         self._last_trade_tick = self._current_tick
+        self._max_unrealized_profit = -np.inf
+        self._min_unrealized_profit = np.inf
 
     def _exit_trade(self) -> None:
         self._update_total_profit()
@@ -1372,6 +1420,8 @@ class MyRLEnv(Base5ActionRLEnv):
         self._position = Positions.Neutral
         self._last_trade_tick = None
         self._last_closed_trade_tick = self._current_tick
+        self._max_unrealized_profit = -np.inf
+        self._min_unrealized_profit = np.inf
 
     def execute_trade(self, action: int) -> Optional[str]:
         """
@@ -1406,7 +1456,7 @@ class MyRLEnv(Base5ActionRLEnv):
         """
         self._current_tick += 1
         self._update_unrealized_total_profit()
-        previous_pnl = self.get_unrealized_profit()
+        pre_pnl = self.get_unrealized_profit()
         self._update_portfolio_log_returns()
         self._force_action = self._get_force_action()
         reward = self.calculate_reward(action)
@@ -1414,18 +1464,27 @@ class MyRLEnv(Base5ActionRLEnv):
         self.tensorboard_log(Actions._member_names_[action], category="actions")
         trade_type = self.execute_trade(action)
         if trade_type is not None:
-            self.append_trade_history(trade_type, self.current_price(), previous_pnl)
+            self.append_trade_history(trade_type, self.current_price(), pre_pnl)
         self._position_history.append(self._position)
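+        # Ordering note: the extrema trackers are refreshed only now, after
+        # calculate_reward() has run, so the reward compares the current pnl
+        # against highs/lows recorded up to the previous tick.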
+        pnl = self.get_unrealized_profit()
+        self._update_max_unrealized_profit(pnl)
+        self._update_min_unrealized_profit(pnl)
+        delta_pnl = pnl - pre_pnl
         info = {
             "tick": self._current_tick,
             "position": self._position.value,
             "action": action,
             "force_action": (self._force_action.name if self._force_action else None),
-            "previous_pnl": round(previous_pnl, 5),
-            "pnl": round(self.get_unrealized_profit(), 5),
+            "pre_pnl": round(pre_pnl, 5),
+            "pnl": round(pnl, 5),
+            "delta_pnl": round(delta_pnl, 5),
+            "max_pnl": round(self.get_max_unrealized_profit(), 5),
+            "min_pnl": round(self.get_min_unrealized_profit(), 5),
+            "most_recent_return": round(self.get_most_recent_return(), 5),
+            "most_recent_profit": round(self.get_most_recent_profit(), 5),
+            "total_profit": round(self._total_profit, 5),
             "reward": round(reward, 5),
             "total_reward": round(self.total_reward, 5),
-            "total_profit": round(self._total_profit, 5),
             "idle_duration": self.get_idle_duration(),
             "trade_duration": self.get_trade_duration(),
             "trade_count": int(len(self.trade_history) // 2),
@@ -1513,32 +1572,66 @@ class MyRLEnv(Base5ActionRLEnv):
             return self._current_tick - self._start_tick
         return self._current_tick - self._last_closed_trade_tick
 
-    def get_most_recent_max_pnl(self) -> float:
+    def get_previous_unrealized_profit(self) -> float:
         """
-        Calculate the most recent maximum unrealized profit if in a trade
+        Get the previous tick unrealized profit if the agent is in a trade
         """
         if self._last_trade_tick is None:
             return 0.0
         if self._position == Positions.Neutral:
             return 0.0
-        pnl_history = self.history.get("pnl")
-        if not pnl_history or len(pnl_history) == 0:
+        elif self._position == Positions.Short:
+            previous_price = self.add_entry_fee(self.previous_price())
+            last_trade_price = self.add_exit_fee(
+                self.prices.iloc[self._last_trade_tick].open
+            )
+            return (last_trade_price - previous_price) / last_trade_price
+        elif self._position == Positions.Long:
+            previous_price = self.add_exit_fee(self.previous_price())
+            last_trade_price = self.add_entry_fee(
+                self.prices.iloc[self._last_trade_tick].open
+            )
+            return (previous_price - last_trade_price) / last_trade_price
+        else:
             return 0.0
-        pnl_history = np.asarray(pnl_history)
-        ticks = self.history.get("tick")
-        if not ticks:
+
+    def get_max_unrealized_profit(self) -> float:
+        """
+        Get the maximum unrealized profit if the agent is in a trade
+        """
+        if self._last_trade_tick is None:
             return 0.0
-        ticks = np.asarray(ticks)
-        start = np.searchsorted(ticks, self._last_trade_tick, side="left")
-        trade_pnl_history = pnl_history[start:]
-        if trade_pnl_history.size == 0:
+        if self._position == Positions.Neutral:
             return 0.0
-        return np.max(trade_pnl_history)
+        if not np.isfinite(self._max_unrealized_profit):
+            return self.get_unrealized_profit()
+        return self._max_unrealized_profit
+
+    def _update_max_unrealized_profit(self, pnl: float) -> None:
+        if self._position in (Positions.Long, Positions.Short):
+            if pnl > self._max_unrealized_profit:
+                self._max_unrealized_profit = pnl
+
+    def get_min_unrealized_profit(self) -> float:
+        """
+        Get the minimum unrealized profit if the agent is in a trade
+        """
+        if self._last_trade_tick is None:
+            return 0.0
+        if self._position == Positions.Neutral:
+            return 0.0
+        if not np.isfinite(self._min_unrealized_profit):
+            return self.get_unrealized_profit()
+        return self._min_unrealized_profit
+
+    def _update_min_unrealized_profit(self, pnl: float) -> None:
+        if self._position in (Positions.Long, Positions.Short):
+            if pnl < self._min_unrealized_profit:
+                self._min_unrealized_profit = pnl
 
     def get_most_recent_return(self) -> float:
         """
-        Calculate the tick to tick return if in a trade.
+        Calculate the tick to tick return if the agent is in a trade.
         Return is generated from rising prices in Long and falling prices in Short positions.
         The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
         """
@@ -1549,18 +1642,20 @@ class MyRLEnv(Base5ActionRLEnv):
         elif self._position == Positions.Long:
             current_price = self.current_price()
             previous_price = self.previous_price()
+            previous_tick = self.previous_tick()
             if (
-                self._position_history[self._current_tick - 1] == Positions.Short
-                or self._position_history[self._current_tick - 1] == Positions.Neutral
+                self._position_history[previous_tick] == Positions.Short
+                or self._position_history[previous_tick] == Positions.Neutral
             ):
                 previous_price = self.add_entry_fee(previous_price)
             return np.log(current_price) - np.log(previous_price)
         elif self._position == Positions.Short:
             current_price = self.current_price()
             previous_price = self.previous_price()
+            previous_tick = self.previous_tick()
             if (
-                self._position_history[self._current_tick - 1] == Positions.Long
-                or self._position_history[self._current_tick - 1] == Positions.Neutral
+                self._position_history[previous_tick] == Positions.Long
+                or self._position_history[previous_tick] == Positions.Neutral
             ):
                 previous_price = self.add_exit_fee(previous_price)
             return np.log(previous_price) - np.log(current_price)
@@ -1571,7 +1666,7 @@ class MyRLEnv(Base5ActionRLEnv):
 
     def get_most_recent_profit(self) -> float:
         """
-        Calculate the tick to tick unrealized profit if in a trade
+        Calculate the tick to tick unrealized profit if the agent is in a trade
        """
         if self._last_trade_tick is None:
             return 0.0
@@ -1587,8 +1682,11 @@ class MyRLEnv(Base5ActionRLEnv):
             return (previous_price - current_price) / previous_price
         return 0.0
 
+    def previous_tick(self) -> int:
+        return max(self._current_tick - 1, self._start_tick)
+
     def previous_price(self) -> float:
-        return self.prices.iloc[self._current_tick - 1].get("open")
+        return self.prices.iloc[self.previous_tick()].get("open")
 
     def get_env_history(self) -> DataFrame:
         """
@@ -1746,17 +1844,17 @@ class MyRLEnv(Base5ActionRLEnv):
                 color="gray",
                 label="pnl",
             )
-            previous_pnl_series = history.get("previous_pnl")
-            if previous_pnl_series is not None and len(previous_pnl_series) > 0:
+            pre_pnl_series = history.get("pre_pnl")
+            if pre_pnl_series is not None and len(pre_pnl_series) > 0:
                 axs[1].plot(
                     ticks,
-                    previous_pnl_series,
+                    pre_pnl_series,
                     linewidth=1,
                     color="deepskyblue",
-                    label="previous_pnl",
+                    label="pre_pnl",
                 )
             if (pnl_series is not None and len(pnl_series) > 0) or (
-                previous_pnl_series is not None and len(previous_pnl_series) > 0
+                pre_pnl_series is not None and len(pre_pnl_series) > 0
             ):
                 axs[1].legend(loc="upper left", fontsize=8)
             axs[1].axhline(y=0, label="0", alpha=0.33, color="gray")
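
A minimal configuration sketch, assuming the standard freqtrade freqai.rl_config
nesting: only the model_reward_parameters keys below are read by this change, and
the values are illustrative (the code defaults are "linear", 2.0, 0.0 and 0.5):

    # Hypothetical excerpt of the freqtrade config, in Python dict form.
    rl_config = {
        "max_trade_duration_candles": 128,
        "model_reward_parameters": {
            "exit_factor_mode": "hybrid",   # "legacy" | "sqrt" | "linear" | "hybrid"
            "win_reward_factor": 2.0,       # applied when pnl > profit_aim * rr
            "exit_efficiency_weight": 0.5,  # 0.0 disables the efficiency term
            "exit_efficiency_center": 0.5,  # neutral point of the efficiency ratio
        },
    }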