From: Jérôme Benoit Date: Sat, 15 Feb 2025 22:37:42 +0000 (+0100) Subject: feat(reforcexy): initial ReforceXY commit X-Git-Url: https://git.piment-noir.org/?a=commitdiff_plain;h=2e2073901bed82a2f2fbde3305641d77db3a6dae;p=freqai-strategies.git feat(reforcexy): initial ReforceXY commit Signed-off-by: Jérôme Benoit --- diff --git a/ReforceXY/docker-compose.yml b/ReforceXY/docker-compose.yml new file mode 100644 index 0000000..5f2002b --- /dev/null +++ b/ReforceXY/docker-compose.yml @@ -0,0 +1,35 @@ +--- +services: + freqtrade: + # image: freqtradeorg/freqtrade:stable_freqai + # # Enable GPU Image and GPU Resources + # # Make sure to uncomment the whole deploy section + # deploy: + # resources: + # reservations: + # devices: + # - driver: nvidia + # count: 1 + # capabilities: [gpu] + + # Build step - only needed when additional dependencies are needed + build: + context: . + dockerfile: "./docker/Dockerfile.custom" + restart: unless-stopped + container_name: freqtrade-ReforceXY + volumes: + - "./user_data:/freqtrade/user_data" + # Expose api on port 8082 + # Please read the https://www.freqtrade.io/en/stable/rest-api/ documentation + # for more information. + ports: + - "0.0.0.0:8082:8080" + # Default command used when running `docker compose up` + command: > + trade + --logfile /freqtrade/user_data/logs/freqtrade-ReforceXY.log + --db-url sqlite:////freqtrade/user_data/freqtrade-ReforceXY-tradesv3.sqlite + --config /freqtrade/user_data/config.json + --freqaimodel ReforceXY + --strategy RLAgentStrategy diff --git a/ReforceXY/docker/Dockerfile.custom b/ReforceXY/docker/Dockerfile.custom new file mode 100644 index 0000000..7b76fe9 --- /dev/null +++ b/ReforceXY/docker/Dockerfile.custom @@ -0,0 +1,4 @@ +FROM freqtradeorg/freqtrade:stable_freqairl + +ARG optuna_version=4.2.1 +RUN pip install --user optuna==${optuna_version} optuna-dashboard diff --git a/ReforceXY/user_data/backtest_results/.gitkeep b/ReforceXY/user_data/backtest_results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/config-template.json b/ReforceXY/user_data/config-template.json new file mode 100644 index 0000000..1ea31f6 --- /dev/null +++ b/ReforceXY/user_data/config-template.json @@ -0,0 +1,196 @@ +{ + "$schema": "https://schema.freqtrade.io/schema.json", + "max_open_trades": 10, + "stake_currency": "USDT", + "stake_amount": "unlimited", + "tradable_balance_ratio": 0.99, + "fiat_display_currency": "USD", + "dry_run": true, + "dry_run_wallet": 1000, + "cancel_open_orders_on_exit": false, + // "trading_mode": "futures", + // "margin_mode": "isolated", + "trading_mode": "spot", + "minimal_roi": { + "0": 0.03 + }, // Take_profit exit value used with force_actions + "stoploss": -0.03, // Stop_loss exit value used with force_actions + "unfilledtimeout": { + "entry": 10, + "exit": 10, + "exit_timeout_count": 0, + "unit": "minutes" + }, + "entry_pricing": { + "price_side": "other", + "use_order_book": true, + "order_book_top": 1, + "price_last_balance": 0.0, + "check_depth_of_market": { + "enabled": false, + "bids_to_ask_delta": 1 + } + }, + "exit_pricing": { + "price_side": "other", + "use_order_book": true, + "order_book_top": 1, + "price_last_balance": 0.0 + }, + "exchange": { + "name": "binance", + "key": "", + "secret": "", + "walletAddress": "", + "privateKey": "", + "ccxt_config": { + "enableRateLimit": true, + "rateLimit": 60 + }, + "ccxt_async_config": { + "enableRateLimit": true, + "rateLimit": 60 + }, + // Spot top 5 + "pair_whitelist": [ + "BTC/USDT", + "ETH/USDT", + "SOL/USDT", + 
"BNB/USDT", + "XRP/USDT" + ], + // // Spot IA + // "pair_whitelist": [ + // "NEAR/USDT", + // "ICP/USDT", + // "RENDER/USDT", + // "TAO/USDT", + // "FET/USDT" + // ], + // // Spot restaking + // "pair_whitelist": [ + // "PENDLE/USDT", + // "EIGEN/USDT", + // "ETHFI/USDT" + // ], + // // Spot meme + // "pair_whitelist": [ + // "DOGE/USDT", + // "PENGU/USDT", + // "SHIB/USDT", + // "PEPE/USDT", + // "BONK/USDT" + // ], + }, + "pairlists": [ + { + "method": "StaticPairList" + } + ], + "telegram": { + "enabled": false, + "token": "", + "chat_id": "" + }, + "api_server": { + "enabled": false, + "listen_ip_address": "0.0.0.0", + "listen_port": 8080, + "verbosity": "error", + "enable_openapi": false, + "jwt_secret_key": "", + "ws_token": "", + "CORS_origins": [], + "username": "freqtrader", + "password": "freqtrader" + }, + "freqai": { + "enabled": true, + "conv_width": 1, + "purge_old_models": 2, + "expiration_hours": 12, + "train_period_days": 60, + // "live_retrain_hours": 0.5, + "backtest_period_days": 2, + "write_metrics_to_disk": false, + "identifier": "ReforceXY", + "fit_live_predictions_candles": 300, + "data_kitchen_thread_count": 6, // set to number of CPU threads / 4 + "track_performance": false, + "feature_parameters": { + "include_corr_pairlist": [ + "BTC/USDT", + "ETH/USDT" + ], + "include_timeframes": [ + "5m", + "15m", + "1h", + "4h" + ], + "label_period_candles": 100, + "include_shifted_candles": 6, + "DI_threshold": 10, + "weight_factor": 0.9, + "principal_component_analysis": false, + "use_SVM_to_remove_outliers": false, + "use_DBSCAN_to_remove_outliers": false, + "indicator_periods_candles": [ + 8, + 16, + 32 + ], + "inlier_metric_window": 0, + "noise_standard_deviation": 0.02, + "reverse_test_train_order": false, + "plot_feature_importances": 0, + "buffer_train_data_candles": 100 + }, + "data_split_parameters": { + "test_size": 0.333, + "random_state": 1, + "shuffle": false + }, + "model_training_parameters": { + // "device": "cuda", + "verbose": 1 + }, + "rl_config": { + "model_type": "PPO", + "policy_type": "MlpPolicy", + "model_reward_parameters": { + "rr": 1, + "profit_aim": 0.025, + "win_reward_factor": 2 + }, + "train_cycles": 25, + "add_state_info": true, + "cpu_count": 6, + "max_training_drawdown_pct": 0.02, + "max_trade_duration_candles": 96, // Timeout exit value used with force_actions + "force_actions": false, // Utilize minimal_roi, stoploss, and max_trade_duration_candles as TP/SL/Timeout in the environment + "n_envs": 1, // Number of DummyVecEnv environments + "frame_staking": 0, // Number of VecFrameStack stacks (set to 1 to use) + "lr_schedule": false, // Enable learning rate linear schedule + "cr_schedule": false, // Enable clip range linear schedule + "max_no_improvement_evals": 0, // Maximum consecutive evaluations without a new best model + "min_evals": 0, // Number of evaluations before start to count evaluations without improvements + "check_envs": true, // Check that an environment follows Gym API + "plot_new_best": false // Enable tensorboard rollout plot upon finding a new best model + }, + "rl_config_optuna": { + "enabled": false, // Enable optuna hyperopt + "n_jobs": 6, + "n_trials": 100, + "n_startup_trials": 10, + "timeout_hours": 0 + } + }, + "bot_name": "freqtrade-ReforceXY", + "initial_state": "running", + "timeframe": "5m", + "force_entry_enable": false, + "internals": { + "process_throttle_secs": 5 + } +} diff --git a/ReforceXY/user_data/data/.gitkeep b/ReforceXY/user_data/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git 
a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py new file mode 100644 index 0000000..b4e6379 --- /dev/null +++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py @@ -0,0 +1,1350 @@ +import copy +import gc +import json +import logging +import warnings +from enum import Enum +from pathlib import Path +from typing import Any, Callable, Dict, Optional, Type, Tuple + +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.transforms as mtransforms +import numpy as np +import torch as th +from gymnasium import Env +from gymnasium.spaces import Box +from optuna import Trial, TrialPruned, create_study +from optuna.exceptions import ExperimentalWarning +from optuna.pruners import HyperbandPruner +from optuna.samplers import TPESampler +from optuna.study import Study, StudyDirection +from optuna.storages import JournalStorage +from optuna.storages.journal import JournalFileBackend +from pandas import DataFrame, concat, merge +from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback +from stable_baselines3.common.callbacks import ( + BaseCallback, + ProgressBarCallback, + StopTrainingOnNoModelImprovement, +) +from stable_baselines3.common.env_checker import check_env +from stable_baselines3.common.logger import Figure, HParam +from stable_baselines3.common.utils import set_random_seed +from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecMonitor + +from freqtrade.freqai.data_kitchen import FreqaiDataKitchen +from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions +from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment +from freqtrade.freqai.RL.BaseReinforcementLearningModel import ( + BaseReinforcementLearningModel, +) +from freqtrade.freqai.tensorboard.TensorboardCallback import TensorboardCallback +from freqtrade.strategy import timeframe_to_minutes + + +matplotlib.use("Agg") +warnings.filterwarnings("ignore", category=UserWarning) +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=ExperimentalWarning) +logger = logging.getLogger(__name__) + + +class ForceActions(Enum): + Take_profit = 1 + Stop_loss = 2 + Timeout = 3 + + +class ReforceXY(BaseReinforcementLearningModel): + """ + Custom Freqtrade Freqai reinforcement learning prediction model. + Model specific config: + { + "freqaimodel": "ReforceXY", + "strategy": "RLAgentStrategy", + "minimal_roi": {"0": 0.03}, // Take_profit exit value used with force_actions + "stoploss": -0.03, // Stop_loss exit value used with force_actions + ... + "freqai": { + ... + "rl_config": { + ... 
+ "max_trade_duration_candles": 96, // Timeout exit value used with force_actions + "force_actions": false, // Utilize minimal_roi, stoploss, and max_trade_duration_candles as TP/SL/Timeout in the environment + "n_envs": 1, // Number of DummyVecEnv environments + "frame_staking": 0, // Number of VecFrameStack stacks (set to 1 to use) + "lr_schedule": false, // Enable learning rate linear schedule + "cr_schedule": false, // Enable clip range linear schedule + "max_no_improvement_evals": 0, // Maximum consecutive evaluations without a new best model + "min_evals": 0, // Number of evaluations before start to count evaluations without improvements + "check_envs": true, // Check that an environment follows Gym API + "plot_new_best": false, // Enable tensorboard rollout plot upon finding a new best model + }, + "rl_config_optuna": { + "enabled": false, // Enable optuna hyperopt + "n_jobs": 1, + "n_trials": 100, + "n_startup_trials": 10, + "timeout_hours": 0, + } + } + } + Requirements: + - pip install optuna + + Optional: + - pip install optuna-dashboard + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.is_maskable: bool = ( + self.model_type == "MaskablePPO" + ) # Enable action masking + self.lr_schedule: bool = self.rl_config.get("lr_schedule", False) + self.cr_schedule: bool = self.rl_config.get("cr_schedule", False) + self.n_envs: int = self.rl_config.get("n_envs", 1) + self.frame_staking: int = self.rl_config.get("frame_staking", 0) + self.frame_staking += 1 if self.frame_staking == 1 else 0 + self.max_no_improvement_evals: int = self.rl_config.get( + "max_no_improvement_evals", 0 + ) + self.min_evals: int = self.rl_config.get("min_evals", 0) + self.plot_new_best: bool = self.rl_config.get("plot_new_best", False) + self.check_envs: bool = self.rl_config.get("check_envs", True) + self.progressbar_callback: Optional[ProgressBarCallback] = None + # Optuna hyperopt + self.rl_config_optuna: dict = self.freqai_info.get("rl_config_optuna", {}) + self.hyperopt: bool = self.rl_config_optuna.get("enabled", False) + self.optuna_timeout_hours: float = self.rl_config_optuna.get("timeout_hours", 0) + self.optuna_n_trials: int = self.rl_config_optuna.get("n_trials", 100) + self.optuna_n_startup_trials: int = self.rl_config_optuna.get( + "n_startup_trials", 10 + ) + self.optuna_trial_params: list = [] + self.optuna_callback: Optional[MaskableTrialEvalCallback] = None + self.unset_unsupported() + + def unset_unsupported(self) -> None: + """ + If user has activated any custom function that may conflict, this + function will set them to false and warn them + """ + if self.continual_learning and self.frame_staking: + logger.warning( + "User tried to use continual_learning with frame_staking. 
\ + Deactivating continual_learning" + ) + self.continual_learning = False + + def set_train_and_eval_environments( + self, + data_dictionary: Dict[str, DataFrame], + prices_train: DataFrame, + prices_test: DataFrame, + dk: FreqaiDataKitchen, + ) -> None: + """ + Set training and evaluation environments + """ + self.close_envs() + + train_df = data_dictionary["train_features"] + test_df = data_dictionary["test_features"] + env_dict = self.pack_env_dict(dk.pair) + seed = self.model_training_parameters.get("seed", 42) + + if self.check_envs: + logger.info("Checking environments...") + check_env( + self.MyRLEnv( + id="train_env_check", df=train_df, prices=prices_train, **env_dict + ) + ) + check_env( + self.MyRLEnv( + id="eval_env_check", df=test_df, prices=prices_test, **env_dict + ) + ) + + logger.info("Populating environments: %s", self.n_envs) + train_env = DummyVecEnv( + [ + make_env( + self.MyRLEnv, + "train_env", + i, + seed, + train_df, + prices_train, + env_info=env_dict, + ) + for i in range(self.n_envs) + ] + ) + eval_env = DummyVecEnv( + [ + make_env( + self.MyRLEnv, + "eval_env", + i, + seed, + test_df, + prices_test, + env_info=env_dict, + ) + for i in range(self.n_envs) + ] + ) + if self.frame_staking: + logger.info("Frame staking: %s", self.frame_staking) + train_env = VecFrameStack(train_env, n_stack=self.frame_staking) + eval_env = VecFrameStack(eval_env, n_stack=self.frame_staking) + + self.train_env = VecMonitor(train_env) + self.eval_env = VecMonitor(eval_env) + + def get_model_params(self) -> Dict: + """ + Get model parameters + """ + model_params: Dict = copy.deepcopy(self.model_training_parameters) + + if self.lr_schedule: + _lr = model_params.get("learning_rate", 0.0003) + model_params["learning_rate"] = linear_schedule(_lr) + logger.info("Learning rate linear schedule enabled, initial value: %s", _lr) + + if self.cr_schedule: + _cr = model_params.get("clip_range", 0.2) + model_params["clip_range"] = linear_schedule(_cr) + logger.info("Clip range linear schedule enabled, initial value: %s", _cr) + + model_params["policy_kwargs"] = { + "net_arch": self.net_arch, + "activation_fn": th.nn.ReLU, + "optimizer_class": th.optim.Adam, + # "ortho_init": True + } + if "PPO" in self.model_type: + model_params["policy_kwargs"]["net_arch"] = { + "pi": self.net_arch, + "vf": self.net_arch, + } + + return model_params + + def get_callbacks( + self, eval_freq: int, data_path: str, trial: Trial = None + ) -> list: + """ + Get the model specific callbacks + """ + callbacks: list[BaseCallback] = [] + no_improvement_callback = None + rollout_plot_callback = None + verbose = self.model_training_parameters.get("verbose", 0) + + if self.n_envs > 1: + eval_freq //= self.n_envs + + if self.plot_new_best: + rollout_plot_callback = RolloutPlotCallback(verbose=verbose) + + if self.max_no_improvement_evals: + no_improvement_callback = StopTrainingOnNoModelImprovement( + max_no_improvement_evals=self.max_no_improvement_evals, + min_evals=self.min_evals, + verbose=1, + ) + + if self.activate_tensorboard: + info_callback = InfoMetricsCallback(actions=Actions, verbose=verbose) + callbacks.append(info_callback) + + if self.rl_config.get("progress_bar", False): + self.progressbar_callback = ProgressBarCallback() + callbacks.append(self.progressbar_callback) + + self.eval_callback = MaskableEvalCallback( + self.eval_env, + eval_freq=eval_freq, + deterministic=True, + render=False, + best_model_save_path=data_path, + use_masking=self.is_maskable, + callback_on_new_best=rollout_plot_callback, + 
callback_after_eval=no_improvement_callback, + verbose=verbose, + ) + callbacks.append(self.eval_callback) + + if not trial: + return callbacks + + self.optuna_callback = MaskableTrialEvalCallback( + self.eval_env, + trial, + eval_freq=eval_freq, + deterministic=True, + render=False, + best_model_save_path=data_path, + use_masking=self.is_maskable, + verbose=verbose, + ) + callbacks.append(self.optuna_callback) + return callbacks + + def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs): + """ + User customizable fit method + :param data_dictionary: dict = common data dictionary containing all train/test + features/labels/weights. + :param dk: FreqaiDatakitchen = data kitchen for current pair. + :return: + model Any = trained model to be used for inference in dry/live/backtesting + """ + train_df = data_dictionary["train_features"] + train_timesteps = len(train_df) + test_timesteps = len(data_dictionary["test_features"]) + train_cycles = int(self.rl_config.get("train_cycles", 25)) + total_timesteps = train_timesteps * train_cycles + train_days = steps_to_days(train_timesteps, self.config["timeframe"]) + total_days = steps_to_days(total_timesteps, self.config["timeframe"]) + + logger.info("Action masking: %s", self.is_maskable) + logger.info( + "Train: %s steps (%s days) * %s cycles = Total %s (%s days)", + train_timesteps, + train_days, + train_cycles, + total_timesteps, + total_days, + ) + logger.info( + "Test: %s steps (%s days)", + test_timesteps, + steps_to_days(test_timesteps, self.config["timeframe"]), + ) + logger.info("Hyperopt: %s", self.hyperopt) + + if self.hyperopt: + model_params = self.study(train_df, total_timesteps, dk) + else: + model_params = self.get_model_params() + logger.info("%s params: %s", self.model_type, model_params) + + if self.activate_tensorboard: + tb_path = Path(dk.full_path / "tensorboard" / dk.pair.split("/")[0]) + else: + tb_path = None + + if dk.pair not in self.dd.model_dictionary or not self.continual_learning: + model = self.MODELCLASS( + self.policy_type, + self.train_env, + tensorboard_log=tb_path, + **model_params, + ) + else: + logger.info( + "Continual training activated - starting training from previously trained agent." + ) + model = self.dd.model_dictionary[dk.pair] + model.set_env(self.train_env) + + callbacks = self.get_callbacks(train_timesteps, str(dk.data_path)) + try: + model.learn(total_timesteps=total_timesteps, callback=callbacks) + finally: + if self.progressbar_callback: + self.progressbar_callback.on_training_end() + + if Path(dk.data_path / "best_model.zip").is_file(): + logger.info("Callback found a best model.") + best_model = self.MODELCLASS.load(dk.data_path / "best_model") + return best_model + + logger.info("Couldn't find best model, using final model instead.") + + return model + + def get_state_info(self, pair: str) -> Tuple[float, float, int]: + """ + State info during dry/live (not backtesting) which is fed back + into the model. 
+ :param pair: str = COIN/STAKE to get the environment information for + :return: + :market_side: float = representing short, long, or neutral for + pair + :current_profit: float = unrealized profit of the current trade + :trade_duration: int = the number of candles that the trade has + been open for + """ + # STATE_INFO + position, pnl, trade_duration = super().get_state_info(pair) + return position, pnl, int(trade_duration) + + def rl_model_predict( + self, dataframe: DataFrame, dk: FreqaiDataKitchen, model: Any + ) -> DataFrame: + """ + A helper function to make predictions in the Reinforcement learning module. + :param dataframe: DataFrame = the dataframe of features to make the predictions on + :param dk: FreqaiDatakitchen = data kitchen for the current pair + :param model: Any = the trained model used to inference the features. + """ + + def _is_valid(action: int, position: float) -> bool: + return not ( + ( + action in (Actions.Short_enter.value, Actions.Long_enter.value) + and position != Positions.Neutral.value + ) + or ( + action == Actions.Long_exit.value + and position != Positions.Long.value + ) + or ( + action == Actions.Short_exit.value + and position != Positions.Short.value + ) + ) + + def _action_masks(position: float): + return [_is_valid(action.value, position) for action in Actions] + + def _predict(window): + observations: DataFrame = dataframe.iloc[window.index] + action_masks_param: dict = {} + + if self.live and self.rl_config.get("add_state_info", False): + position, pnl, trade_duration = self.get_state_info(dk.pair) + # STATE_INFO + observations["pnl"] = pnl + observations["position"] = position + observations["trade_duration"] = trade_duration + + if self.is_maskable: + action_masks_param = {"action_masks": _action_masks(position)} + + observations = observations.to_numpy(dtype=np.float32) + + if self.frame_staking: + observations = np.repeat( + observations, axis=1, repeats=self.frame_staking + ) + + action, _ = model.predict( + observations, deterministic=True, **action_masks_param + ) + return action + + output = DataFrame(np.zeros(len(dataframe)), columns=dk.label_list) + output = output.rolling(window=self.CONV_WIDTH).apply(_predict) + return output + + def study(self, train_df, total_timesteps: int, dk: FreqaiDataKitchen) -> Dict: + """ + Runs hyperparameter optimization using Optuna and + returns the best hyperparameters found + """ + storage_dir, study_name = str(dk.full_path).rsplit("/", 1) + storage_backend = self.rl_config_optuna.get("storage", "file") + if storage_backend == "sqlite": + storage = f"sqlite:///{storage_dir}/optuna.sqlite" + elif storage_backend == "file": + storage = JournalStorage(JournalFileBackend(f"{storage_dir}/optuna.log")) + study: Study = create_study( + study_name=study_name, + sampler=TPESampler( + n_startup_trials=self.optuna_n_startup_trials, + multivariate=True, + group=True, + ), + pruner=HyperbandPruner( + min_resource=1, max_resource=self.optuna_n_trials, reduction_factor=3 + ), + direction=StudyDirection.MAXIMIZE, + storage=storage, + load_if_exists=True, + ) + try: + study.optimize( + lambda trial: self.objective(trial, train_df, total_timesteps, dk), + n_trials=self.optuna_n_trials, + timeout=( + hours_to_seconds(self.optuna_timeout_hours) + if self.optuna_timeout_hours + else None + ), + gc_after_trial=True, + show_progress_bar=self.rl_config.get("progress_bar", False), + n_jobs=self.rl_config_optuna.get("n_jobs", 1), + ) + except KeyboardInterrupt: + pass + + logger.info("------------ Hyperopt results 
------------") + logger.info( + "Best trial: %s. Score: %s", study.best_trial.number, study.best_trial.value + ) + logger.info( + "Best trial params: %s", self.optuna_trial_params[study.best_trial.number] + ) + logger.info("-----------------------------------------") + + best_trial_path = Path(dk.full_path / "hyperopt_best_trial.json") + logger.info("dumping json to %s", best_trial_path) + with best_trial_path.open("w", encoding="utf-8") as write_file: + json.dump(study.best_trial.params, write_file, indent=4) + + return self.optuna_trial_params[study.best_trial.number] + + def objective( + self, trial: Trial, train_df, total_timesteps: int, dk: FreqaiDataKitchen + ) -> float: + """ + Defines a single trial for hyperparameter optimization using Optuna + """ + if "PPO" in self.model_type: + params = sample_params_ppo(trial) + elif "QRDQN" in self.model_type: + params = sample_params_qrdqn(trial) + elif "DQN" in self.model_type: + params = sample_params_dqn(trial) + else: + raise NotImplementedError + + params = {**self.model_training_parameters, **params} + + nan_encountered = False + tensorboard_log_path = Path( + dk.full_path / "tensorboard" / dk.pair.split("/")[0] + ) + + logger.info("------------ Hyperopt ------------") + logger.info("------------ Trial %s ------------", trial.number) + logger.info("Trial %s params: %s", trial.number, params) + self.optuna_trial_params.append(params) + + model = self.MODELCLASS( + self.policy_type, + self.train_env, + tensorboard_log=tensorboard_log_path, + **params, + ) + callbacks = self.get_callbacks(len(train_df), str(dk.data_path), trial) + + try: + model.learn(total_timesteps=total_timesteps, callback=callbacks) + except AssertionError: + logger.warning("Optuna encountered NaN") + nan_encountered = True + finally: + if self.progressbar_callback: + self.progressbar_callback.on_training_end() + self.close_envs() + model.env.close() + + if nan_encountered: + return float("nan") + + if self.optuna_callback.is_pruned: + raise TrialPruned() + + return self.optuna_callback.best_mean_reward + + def close_envs(self): + """ + Closes the training and evaluation environments if they are open + """ + if self.train_env: + self.train_env.close() + if self.eval_env: + self.eval_env.close() + + class MyRLEnv(Base5ActionRLEnv): + """ + Env + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.force_actions: bool = self.rl_config.get("force_actions", False) + self._force_action: ForceActions = None + self.take_profit: float = self.config["minimal_roi"]["0"] + self.stop_loss: float = self.config["stoploss"] + self.timeout: int = self.rl_config.get("max_trade_duration_candles", 128) + self._last_closed_position: Positions = None + self._last_closed_trade_tick: int = 0 + # self.reward_range = (-1, 1) + if self.force_actions: + logger.info( + "%s - take_profit: %s, stop_loss: %s, timeout: %s candles (%s days), observation_space: %s", + self.id, + self.take_profit, + self.stop_loss, + self.timeout, + steps_to_days(self.timeout, self.config["timeframe"]), + self.observation_space, + ) + + def reset_env( + self, + df: DataFrame, + prices: DataFrame, + window_size: int, + reward_kwargs: dict, + starting_point=True, + ): + """ + Resets the environment when the agent fails + """ + super().reset_env(df, prices, window_size, reward_kwargs, starting_point) + self.state_features = ["pnl", "position", "trade_duration"] # STATE_INFO + if self.add_state_info: + self.total_features = self.signal_features.shape[1] + len( + self.state_features + ) + self.shape = 
(window_size, self.total_features) + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32 + ) + + def reset(self, seed=None, **kwargs): + """ + Reset is called at the beginning of every episode + """ + _, history = super().reset(seed, **kwargs) + self._force_action: ForceActions = None + self._last_closed_position: Positions = None + self._last_closed_trade_tick: int = 0 + return self._get_observation(), history + + def calculate_reward(self, action) -> float: + """ + An example reward function. This is the one function that users will likely + wish to inject their own creativity into. + + Warning! + This is function is a showcase of functionality designed to show as many possible + environment control features as possible. It is also designed to run quickly + on small computers. This is a benchmark, it is *not* for live production. + + :param action: int = The action made by the agent for the current candle. + :return: + float = the reward to give to the agent for current step (used for optimization + of weights in NN) + """ + pnl = self.get_unrealized_profit() + # mrr = self.get_most_recent_return() + # mrp = self.get_most_recent_profit() + + factor = 100.0 + + # Force exits + if self._force_action in ( + ForceActions.Take_profit, + ForceActions.Stop_loss, + ForceActions.Timeout, + ): + return pnl * factor + + # first, penalize if the action is not valid + if not self._is_valid(action): + return -2 + + # # you can use feature values from dataframe + # rsi_now = self.get_feature_value( + # name="%-rsi", + # period=8, + # pair=self.pair, + # timeframe=self.config["timeframe"], + # raw=True + # ) + + # # reward agent for entering trades + # if (action in (Actions.Long_enter.value, Actions.Short_enter.value) + # and self._position == Positions.Neutral): + # if rsi_now < 40: + # factor = 40 / rsi_now + # else: + # factor = 1 + # return 25 * factor + + # discourage agent from not entering trades + if action == Actions.Neutral.value and self._position == Positions.Neutral: + return -1 + + max_trade_duration = self.rl_config.get("max_trade_duration_candles", 300) + trade_duration = self.get_trade_duration() + + if trade_duration <= max_trade_duration: + factor *= 1.5 + elif trade_duration > max_trade_duration: + factor *= 0.5 + + # discourage sitting in position + if ( + self._position in (Positions.Short, Positions.Long) + and action == Actions.Neutral.value + ): + return -1 * trade_duration / max_trade_duration + + # close long + if action == Actions.Long_exit.value and self._position == Positions.Long: + if pnl > self.profit_aim * self.rr: + factor *= self.rl_config.get("model_reward_parameters", {}).get( + "win_reward_factor", 2 + ) + return float(pnl * factor) + + # close short + if action == Actions.Short_exit.value and self._position == Positions.Short: + if pnl > self.profit_aim * self.rr: + factor *= self.rl_config.get("model_reward_parameters", {}).get( + "win_reward_factor", 2 + ) + return float(pnl * factor) + + return 0.0 + + def _get_observation(self): + """ + This may or may not be independent of action types, user can inherit + this in their custom "MyRLEnv" + """ + features_window = self.signal_features[ + (self._current_tick - self.window_size) : self._current_tick + ] + if self.add_state_info: + features_and_state = DataFrame( + np.zeros( + (len(features_window), len(self.state_features)), + dtype=np.float32, + ), + columns=self.state_features, + index=features_window.index, + ) + # STATE_INFO + features_and_state["pnl"] = 
self.get_unrealized_profit() + features_and_state["position"] = self._position.value + features_and_state["trade_duration"] = self.get_trade_duration() + + features_and_state = concat( + [features_window, features_and_state], axis=1 + ) + return features_and_state.to_numpy(dtype=np.float32) + else: + return features_window.to_numpy(dtype=np.float32) + + def _get_force_action(self): + if not self.force_actions or self._position == Positions.Neutral: + return None + + trade_duration = self.get_trade_duration() + if trade_duration <= 1: + return None + if trade_duration >= self.timeout: + return ForceActions.Timeout + + pnl = self.get_unrealized_profit() + if pnl >= self.take_profit: + return ForceActions.Take_profit + if pnl <= self.stop_loss: + return ForceActions.Stop_loss + + def _get_new_position(self, action: int) -> Positions: + return { + Actions.Long_enter.value: Positions.Long, + Actions.Short_enter.value: Positions.Short, + }[action] + + def _enter_trade(self, action): + self._position = self._get_new_position(action) + self._last_trade_tick = self._current_tick + + def _exit_trade(self): + self._update_total_profit() + self._last_closed_position = self._position + self._position = Positions.Neutral + self._last_closed_trade_tick = self._last_trade_tick + self._last_trade_tick = None + + def execute_trade(self, action: int) -> None: + """ + Execute trade based on the given action + """ + # Force exit trade + if self._force_action: + self._exit_trade() # Exit trade due to force action + self.append_trade_history(f"{self._force_action.name}") + self.tensorboard_log( + f"{self._force_action.name}", category="actions/force" + ) + return None + + if not self.is_tradesignal(action): + return None + + # Enter trade based on action + if action in (Actions.Long_enter.value, Actions.Short_enter.value): + self._enter_trade(action) + self.append_trade_history(f"{self._position.name}_enter") + + # Exit trade based on action + if action in (Actions.Long_exit.value, Actions.Short_exit.value): + self._exit_trade() + self.append_trade_history(f"{self._last_closed_position.name}_exit") + + def step(self, action: int): + """ + Take a step in the environment based on the provided action + """ + self.tensorboard_log(Actions._member_names_[action], category="actions") + self._current_tick += 1 + self._update_unrealized_total_profit() + self._force_action = self._get_force_action() + reward = self.calculate_reward(action) + self.total_reward += reward + info = { + "tick": self._current_tick, + "position": self._position.value, + "action": action, + "force_action": self._get_force_action(), + "pnl": self.get_unrealized_profit(), + "reward": round(reward, 5), + "total_reward": round(self.total_reward, 5), + "total_profit": round(self._total_profit, 5), + "idle_duration": self.get_idle_duration(), + "trade_duration": self.get_trade_duration(), + "trade_count": len(self.trade_history), + } + self.execute_trade(action) + self._position_history.append(self._position) + self._update_history(info) + return ( + self._get_observation(), + reward, + self.is_terminated(), + self.is_truncated(), + info, + ) + + def append_trade_history(self, trade_type: str): + self.trade_history.append( + { + "tick": self._current_tick, + "type": trade_type.lower(), + "price": self.current_price(), + "profit": self.get_unrealized_profit(), + } + ) + + def is_terminated(self) -> bool: + return bool( + self._current_tick == self._end_tick + or self._total_profit <= self.max_drawdown + or self._total_unrealized_profit <= 
self.max_drawdown + ) + + def is_truncated(self) -> bool: + return False + + def is_tradesignal(self, action: int) -> bool: + """ + Determine if the action is entry or exit + """ + return ( + ( + action in (Actions.Short_enter.value, Actions.Long_enter.value) + and self._position == Positions.Neutral + ) + or ( + action == Actions.Long_exit.value + and self._position == Positions.Long + ) + or ( + action == Actions.Short_exit.value + and self._position == Positions.Short + ) + ) + + def _is_valid(self, action: int) -> bool: + """ + Determine if the action is valid for the step + """ + return not ( + ( + action in (Actions.Short_enter.value, Actions.Long_enter.value) + and self._position != Positions.Neutral + ) + or ( + action == Actions.Long_exit.value + and self._position != Positions.Long + ) + or ( + action == Actions.Short_exit.value + and self._position != Positions.Short + ) + ) + + def get_feature_value( + self, + name: str, + period: int = 0, + shift: int = 0, + pair: str = "", + timeframe: str = "", + raw: bool = True, + ) -> float: + """ + Get feature value + """ + feature_parts = [name] + if period: + feature_parts.append(f"_{period}") + if shift: + feature_parts.append(f"_shift-{shift}") + if pair: + feature_parts.append(f"_{pair.replace(':', '')}") + if timeframe: + feature_parts.append(f"_{timeframe}") + feature_col = "".join(feature_parts) + + if not raw: + return self.signal_features[feature_col].iloc[self._current_tick] + return self.raw_features[feature_col].iloc[self._current_tick] + + def get_idle_duration(self) -> int: + if self._position != Positions.Neutral: + return 0 + if not self._last_closed_trade_tick: + return self._current_tick - self._start_tick + return self._current_tick - self._last_closed_trade_tick + + def get_most_recent_return(self) -> float: + """ + Calculate the tick to tick return if in a trade. + Return is generated from rising prices in Long + and falling prices in Short positions. + The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee. 
+ """ + if self._position == Positions.Long: + current_price = self.prices.iloc[self._current_tick].open + previous_price = self.prices.iloc[self._current_tick - 1].open + if ( + self._position_history[self._current_tick - 1] == Positions.Short + or self._position_history[self._current_tick - 1] + == Positions.Neutral + ): + previous_price = self.add_entry_fee(previous_price) + return np.log(current_price) - np.log(previous_price) + if self._position == Positions.Short: + current_price = self.prices.iloc[self._current_tick].open + previous_price = self.prices.iloc[self._current_tick - 1].open + if ( + self._position_history[self._current_tick - 1] == Positions.Long + or self._position_history[self._current_tick - 1] + == Positions.Neutral + ): + previous_price = self.add_exit_fee(previous_price) + return np.log(previous_price) - np.log(current_price) + return 0.0 + + def get_most_recent_profit(self) -> float: + """ + Calculate the tick to tick unrealized profit if in a trade + """ + if self._position == Positions.Long: + current_price = self.add_exit_fee(self.current_price()) + previous_price = self.add_entry_fee(self.previous_price()) + return (current_price - previous_price) / previous_price + elif self._position == Positions.Short: + current_price = self.add_entry_fee(self.current_price()) + previous_price = self.add_exit_fee(self.previous_price()) + return (previous_price - current_price) / previous_price + return 0.0 + + def previous_price(self) -> float: + return self.prices.iloc[self._current_tick - 1].open + + def get_env_history(self) -> DataFrame: + """ + Get environment data from the first to the last trade + """ + _history_df = DataFrame.from_dict(self.history) + _trade_history_df = DataFrame.from_dict(self.trade_history) + _rollout_history = _history_df.merge( + _trade_history_df, on="tick", how="left" + ) + _price_history = ( + self.prices.iloc[_rollout_history.tick].copy().reset_index() + ) + history = merge( + _rollout_history, _price_history, left_index=True, right_index=True + ) + return history + + def get_env_plot(self): + """ + Plot trades and environment data + """ + + def transform_y_offset(ax, offset): + return mtransforms.offset_copy(ax.transData, fig=fig, y=offset) + + def plot_markers(ax, ticks, marker, color, size, offset): + ax.plot( + ticks, + marker=marker, + color=color, + markersize=size, + fillstyle="full", + transform=transform_y_offset(ax, offset), + linestyle="none", + ) + + plt.style.use("dark_background") + fig, axs = plt.subplots( + nrows=5, + ncols=1, + figsize=(14, 8), + height_ratios=[5, 1, 1, 1, 1], + sharex=True, + ) + + # Return empty fig if no trades + if len(self.trade_history) == 0: + return fig + + history = self.get_env_history() + + enter_long_prices = history.loc[history["type"] == "long_enter"]["price"] + enter_short_prices = history.loc[history["type"] == "short_enter"]["price"] + exit_long_prices = history.loc[history["type"] == "long_exit"]["price"] + exit_short_prices = history.loc[history["type"] == "short_exit"]["price"] + take_profit_prices = history.loc[history["type"] == "take_profit"]["price"] + stop_loss_prices = history.loc[history["type"] == "stop_loss"]["price"] + timeout_prices = history.loc[history["type"] == "timeout"]["price"] + axs[0].plot(history["open"], linewidth=1, color="orchid") + + plot_markers(axs[0], enter_long_prices, "^", "forestgreen", 5, -0.1) + plot_markers(axs[0], enter_short_prices, "v", "firebrick", 5, 0.1) + plot_markers(axs[0], exit_long_prices, ".", "cornflowerblue", 4, 0.1) + plot_markers(axs[0], 
exit_short_prices, ".", "thistle", 4, -0.1) + plot_markers(axs[0], take_profit_prices, "*", "lime", 8, 0.1) + plot_markers(axs[0], stop_loss_prices, "x", "red", 8, -0.1) + plot_markers(axs[0], timeout_prices, "1", "yellow", 8, 0) + + axs[1].set_ylabel("pnl") + axs[1].plot(history["pnl"], linewidth=1, color="gray") + axs[1].axhline(y=0, label="0", alpha=0.33, color="gray") + axs[1].axhline(y=self.take_profit, label="tp", alpha=0.33, color="green") + axs[1].axhline(y=self.stop_loss, label="sl", alpha=0.33, color="red") + + axs[2].set_ylabel("reward") + axs[2].plot(history["reward"], linewidth=1, color="gray") + axs[2].axhline(y=0, label="0", alpha=0.33) + + axs[3].set_ylabel("total_profit") + axs[3].plot(history["total_profit"], linewidth=1, color="gray") + axs[3].axhline(y=1, label="0", alpha=0.33) + + axs[4].set_ylabel("total_reward") + axs[4].plot(history["total_reward"], linewidth=1, color="gray") + axs[4].axhline(y=0, label="0", alpha=0.33) + axs[4].set_xlabel("tick") + + _borders = ["top", "right", "bottom", "left"] + for _ax in axs: + for _border in _borders: + _ax.spines[_border].set_color("#5b5e4b") + + fig.suptitle( + f"Total Reward: {self.total_reward:.2f} ~ " + + f"Total Profit: {self._total_profit:.2f} ~ " + + f"Trades: {len(self.trade_history)}" + ) + fig.tight_layout() + return fig + + def close(self) -> None: + plt.close() + gc.collect() + th.cuda.empty_cache() + + +class InfoMetricsCallback(TensorboardCallback): + """ + Tensorboard callback + """ + + def _on_training_start(self) -> None: + _lr = self.model.learning_rate + hparam_dict = { + "algorithm": self.model.__class__.__name__, + "learning_rate": _lr if _lr is float else "lr_schedule", + # "gamma": self.model.gamma, + # "gae_lambda": self.model.gae_lambda, + # "n_steps": self.model.n_steps, + # "batch_size": self.model.batch_size, + } + metric_dict = { + "info/total_reward": 0, + "info/total_profit": 1, + "info/trade_count": 0, + "info/trade_duration": 0, + } + self.logger.record( + "hparams", + HParam(hparam_dict, metric_dict), + exclude=("stdout", "log", "json", "csv"), + ) + + def _on_step(self) -> bool: + local_info = self.locals["infos"][0] + if self.training_env is None: + return True + tensorboard_metrics = self.training_env.get_attr("tensorboard_metrics")[0] + for metric in local_info: + if metric not in ["episode", "terminal_observation", "TimeLimit.truncated"]: + self.logger.record(f"info/{metric}", local_info[metric]) + for category in tensorboard_metrics: + for metric in tensorboard_metrics[category]: + self.logger.record( + f"{category}/{metric}", tensorboard_metrics[category][metric] + ) + return True + + +class RolloutPlotCallback(BaseCallback): + """ + Tensorboard plot callback + """ + + def record_env(self): + figures = self.training_env.env_method("get_env_plot") + for i, fig in enumerate(figures): + figure = Figure(fig, close=True) + self.logger.record( + f"best/train_env_{i}", figure, exclude=("stdout", "log", "json", "csv") + ) + return True + + def _on_step(self) -> bool: + return self.record_env() + + +class MaskableTrialEvalCallback(MaskableEvalCallback): + """ + Optuna maskable trial eval callback + """ + + def __init__( + self, + eval_env, + trial, + n_eval_episodes: int = 10, + eval_freq: int = 2048, + deterministic: bool = True, + use_masking: bool = True, + verbose: int = 0, + **kwargs, + ): + super().__init__( + eval_env=eval_env, + n_eval_episodes=n_eval_episodes, + eval_freq=eval_freq, + deterministic=deterministic, + verbose=verbose, + use_masking=use_masking, + **kwargs, + ) + 
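+        # Keep a reference to the Optuna trial so _on_step() can report intermediate rewards and prune unpromising runs.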
self.trial = trial + self.eval_idx = 0 + self.is_pruned = False + + def _on_step(self) -> bool: + if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0: + super()._on_step() + self.eval_idx += 1 + self.trial.report(self.last_mean_reward, self.eval_idx) + + # Prune trial if need + if self.trial.should_prune(): + self.is_pruned = True + return False + + return True + + +def make_env( + MyRLEnv: Type[BaseEnvironment], + env_id: str, + rank: int, + seed: int, + train_df: DataFrame, + price: DataFrame, + env_info: Dict[str, Any], +) -> Callable: + """ + Utility function for multiprocessed env. + + :param env_id: (str) the environment ID + :param num_env: (int) the number of environment you wish to have in subprocesses + :param seed: (int) the inital seed for RNG + :param rank: (int) index of the subprocess + :param env_info: (dict) all required arguments to instantiate the environment. + :return: (Callable) + """ + + def _init() -> Env: + env = MyRLEnv( + df=train_df, prices=price, id=env_id, seed=seed + rank, **env_info + ) + return env + + set_random_seed(seed) + return _init + + +def linear_schedule(initial_value: float) -> Callable[[float], float]: + def func(progress_remaining: float) -> float: + return progress_remaining * initial_value + + return func + + +def hours_to_seconds(hours): + """ + Converts hours to seconds + """ + seconds = hours * 3600 + return seconds + + +def steps_to_days(steps: int, timeframe: str) -> float: + """ + Calculate the number of days based on the given number of steps + """ + total_minutes = steps * timeframe_to_minutes(timeframe) + days = total_minutes / (24 * 60) + return round(days, 1) + + +def sample_params_ppo(trial: Trial) -> Dict[str, Any]: + """ + Sampler for PPO hyperparams + """ + batch_size = trial.suggest_categorical("batch_size", [64, 256, 512, 1024, 10240]) + n_steps = trial.suggest_categorical("n_steps", [512, 1024, 10240]) + gamma = trial.suggest_float("gamma", 0.1, 0.99, step=0.01) + learning_rate = trial.suggest_float("learning_rate", 1e-6, 0.01, log=True) + ent_coef = trial.suggest_float("ent_coef", 0.001, 0.1, log=True) + clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4, 0.5]) + n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20]) + gae_lambda = trial.suggest_float("gae_lambda", 0.1, 0.99, step=0.01) + max_grad_norm = trial.suggest_float("max_grad_norm", 0.1, 5, step=0.01) + vf_coef = trial.suggest_float("vf_coef", 0, 1, step=0.01) + net_arch_type = trial.suggest_categorical("net_arch", ["small", "medium", "large"]) + # ortho_init = True + ortho_init = trial.suggest_categorical("ortho_init", [False, True]) + activation_fn_name = trial.suggest_categorical( + "activation_fn", ["tanh", "relu", "elu", "leaky_relu"] + ) + lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"]) + if lr_schedule == "linear": + learning_rate = linear_schedule(learning_rate) + + cr_schedule = trial.suggest_categorical("cr_schedule", ["linear", "constant"]) + if cr_schedule == "linear": + clip_range = linear_schedule(clip_range) + if batch_size > n_steps: + batch_size = n_steps + net_arch = { + "small": {"pi": [128, 128], "vf": [128, 128]}, + "medium": {"pi": [256, 256], "vf": [256, 256]}, + "large": {"pi": [512, 512], "vf": [512, 512]}, + }[net_arch_type] + + activation_fn = { + "tanh": th.nn.Tanh, + "relu": th.nn.ReLU, + "elu": th.nn.ELU, + "leaky_relu": th.nn.LeakyReLU, + }[activation_fn_name] + return { + "n_steps": n_steps, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": 
learning_rate, + "ent_coef": ent_coef, + "clip_range": clip_range, + "n_epochs": n_epochs, + "gae_lambda": gae_lambda, + "max_grad_norm": max_grad_norm, + "vf_coef": vf_coef, + "policy_kwargs": dict( + net_arch=net_arch, + activation_fn=activation_fn, + ortho_init=ortho_init, + ), + } + + +def sample_params_dqn(trial: Trial) -> Dict[str, Any]: + """ + Sampler for DQN hyperparams + """ + gamma = trial.suggest_categorical( + "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999] + ) + learning_rate = trial.suggest_float("learning_rate", 1e-6, 0.01, log=True) + batch_size = trial.suggest_categorical("batch_size", [64, 256, 512]) + buffer_size = trial.suggest_categorical( + "buffer_size", [int(1e4), int(5e4), int(1e5)] + ) + exploration_final_eps = trial.suggest_float( + "exploration_final_eps", 0, 0.2, step=0.1 + ) + exploration_fraction = trial.suggest_float("exploration_fraction", 0, 0.5, step=0.1) + target_update_interval = trial.suggest_categorical( + "target_update_interval", [1000, 5000, 10000] + ) + learning_starts = trial.suggest_categorical("learning_starts", [1000, 5000, 10000]) + train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 128, 256, 1000]) + subsample_steps = trial.suggest_categorical("subsample_steps", [1, 2, 4, 8]) + gradient_steps = max(train_freq // subsample_steps, 1) + net_arch_type = trial.suggest_categorical("net_arch", ["small", "medium", "large"]) + net_arch = { + "small": [128, 128], + "medium": [256, 256], + "large": [512, 512], + }[net_arch_type] + return { + "gamma": gamma, + "learning_rate": learning_rate, + "batch_size": batch_size, + "buffer_size": buffer_size, + "train_freq": train_freq, + "gradient_steps": gradient_steps, + "exploration_fraction": exploration_fraction, + "exploration_final_eps": exploration_final_eps, + "target_update_interval": target_update_interval, + "learning_starts": learning_starts, + "policy_kwargs": dict(net_arch=net_arch), + } + + +def sample_params_qrdqn(trial: Trial) -> Dict[str, Any]: + """ + Sampler for QRDQN hyperparams + """ + hyperparams = sample_params_dqn(trial) + n_quantiles = trial.suggest_int("n_quantiles", 5, 200) + hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles}) + return hyperparams diff --git a/ReforceXY/user_data/hyperopt_results/.gitkeep b/ReforceXY/user_data/hyperopt_results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/hyperopts/.gitkeep b/ReforceXY/user_data/hyperopts/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/logs/.gitkeep b/ReforceXY/user_data/logs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/models/.gitkeep b/ReforceXY/user_data/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/notebooks/.gitkeep b/ReforceXY/user_data/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/plot/.gitkeep b/ReforceXY/user_data/plot/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ReforceXY/user_data/strategies/RLAgentStrategy.py b/ReforceXY/user_data/strategies/RLAgentStrategy.py new file mode 100644 index 0000000..729014b --- /dev/null +++ b/ReforceXY/user_data/strategies/RLAgentStrategy.py @@ -0,0 +1,92 @@ +import logging +from functools import reduce + +# import talib.abstract as ta +from pandas import DataFrame + +from freqtrade.strategy import IStrategy + + +logger = logging.getLogger(__name__) + + +class RLAgentStrategy(IStrategy): + """ + RLAgentStrategy + """ + + 
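+    # minimal_roi and stoploss below are also read by ReforceXY when force_actions is enabled,
+    # serving as the Take_profit and Stop_loss thresholds inside the training environment.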
minimal_roi = {"0": 0.03} + + process_only_new_candles = True + stoploss = -0.03 + use_exit_signal = True + startup_candle_count: int = 300 + can_short = False + + # def feature_engineering_expand_all( + # self, dataframe: DataFrame, period: int, metadata: dict, **kwargs + # ): + # dataframe["%-rsi-period"] = ta.RSI(dataframe, timeperiod=period) + + # return dataframe + + def feature_engineering_expand_basic( + self, dataframe: DataFrame, metadata: dict, **kwargs + ): + dataframe["%-pct-change"] = dataframe["close"].pct_change() + dataframe["%-raw_volume"] = dataframe["volume"] + + return dataframe + + def feature_engineering_standard( + self, dataframe: DataFrame, metadata: dict, **kwargs + ): + dataframe["%-day_of_week"] = dataframe["date"].dt.dayofweek + dataframe["%-hour_of_day"] = dataframe["date"].dt.hour + + dataframe["%-raw_close"] = dataframe["close"] + dataframe["%-raw_open"] = dataframe["open"] + dataframe["%-raw_high"] = dataframe["high"] + dataframe["%-raw_low"] = dataframe["low"] + + return dataframe + + def set_freqai_targets(self, dataframe: DataFrame, metadata: dict, **kwargs): + dataframe["&-action"] = 0 + + return dataframe + + def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame: + dataframe = self.freqai.start(dataframe, metadata, self) + + return dataframe + + def populate_entry_trend(self, df: DataFrame, metadata: dict) -> DataFrame: + enter_long_conditions = [df["do_predict"] == 1, df["&-action"] == 1] + + if enter_long_conditions: + df.loc[ + reduce(lambda x, y: x & y, enter_long_conditions), + ["enter_long", "enter_tag"], + ] = (1, "long") + + enter_short_conditions = [df["do_predict"] == 1, df["&-action"] == 3] + + if enter_short_conditions: + df.loc[ + reduce(lambda x, y: x & y, enter_short_conditions), + ["enter_short", "enter_tag"], + ] = (1, "short") + + return df + + def populate_exit_trend(self, df: DataFrame, metadata: dict) -> DataFrame: + exit_long_conditions = [df["do_predict"] == 1, df["&-action"] == 2] + if exit_long_conditions: + df.loc[reduce(lambda x, y: x & y, exit_long_conditions), "exit_long"] = 1 + + exit_short_conditions = [df["do_predict"] == 1, df["&-action"] == 4] + if exit_short_conditions: + df.loc[reduce(lambda x, y: x & y, exit_short_conditions), "exit_short"] = 1 + + return df
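
Usage note: when "rl_config_optuna.enabled" is true, ReforceXY.study() persists all trials to an Optuna journal file written one directory above dk.full_path (optuna.log), or to optuna.sqlite there when "storage" is set to "sqlite", with the study named after the last path component of dk.full_path. Below is a minimal sketch for inspecting those results offline with the optuna 4.x pinned in Dockerfile.custom; the concrete path user_data/models/optuna.log and the study name "ReforceXY" are assumptions about the default FreqAI layout and may need adjusting.

    from optuna import load_study
    from optuna.storages import JournalStorage
    from optuna.storages.journal import JournalFileBackend

    # Assumed locations: adjust to wherever dk.full_path resolves in your setup.
    storage = JournalStorage(JournalFileBackend("user_data/models/optuna.log"))
    study = load_study(study_name="ReforceXY", storage=storage)

    # Mirrors what ReforceXY logs at the end of study(): best trial number, score and params.
    print(f"Best trial #{study.best_trial.number}: score={study.best_trial.value}")
    print(study.best_trial.params)

The optuna-dashboard package installed by Dockerfile.custom can be pointed at the same storage for an interactive view.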