Basic RL algo connection to Binance Exchange

Hello!  Hope everyone's development code is working smoothly.  I am currently trying to bridge a basic Reinforcement Learning algorithm with the Binance exchange API to paper trade the algorithm.  I have posted two very basic RL algos below; they use very little feature engineering and are readily available on the internet, so I don't feel I'm sharing anything too deep from the RL trading playbook.  I'm hoping to find someone who can help me on my journey to bridge these historical backtesting RL algos to an actual live paper trading account on Binance.  I'll be working on it on the side, and if I find anything before a solution is posted, I'll share it back here with all of you :slight_smile:  Thanks in advance, and feel free to tackle just one of the algos or both!
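
As a rough starting point for the data side of that bridge, here is a minimal sketch of what I have in mind.  It assumes the python-binance package and a Spot testnet API key/secret (the package choice and the placeholder key names are my assumptions, not part of the algos below); it pulls recent candles and shapes them like the Close/Close_Rt frame the environments expect:

import pandas as pd
from binance.client import Client

API_KEY = "YOUR_TESTNET_KEY"        # placeholder: generate a key on the Spot testnet
API_SECRET = "YOUR_TESTNET_SECRET"  # placeholder

# testnet=True points python-binance at the Spot testnet endpoints
client = Client(API_KEY, API_SECRET, testnet=True)

# Pull recent 1-minute candles for a symbol
klines = client.get_klines(symbol="BTCUSDT",
                           interval=Client.KLINE_INTERVAL_1MINUTE,
                           limit=500)

# Each kline is a 12-field list; keep the OHLCV fields and build returns
cols = ["open_time", "Open", "High", "Low", "Close", "Volume",
        "close_time", "quote_volume", "trades",
        "taker_base", "taker_quote", "ignore"]
df_live = pd.DataFrame(klines, columns=cols)
df_live["Close"] = df_live["Close"].astype(float)
df_live["Close_Rt"] = df_live["Close"].pct_change()
df_live = df_live.dropna().reset_index(drop=True)
print(df_live[["Close", "Close_Rt"]].tail())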



P.S.:  The scope of the code is rather large, so I'm not looking for a solution overnight.  I understand this may take some serious time, and I'm willing to check back on this post periodically over the year to add things I've learned and to see if anybody else has come up with creative solutions :slight_smile:



P.P.S.:  In my personal opinion, RL 2 is the best algo overall, in that it provides the ability to both go long and short an equity.  However, since you cannot short spot crypto directly (shorting is only possible through the separate futures market), I understand if you wish to cut out the shorting code; it is not necessary if you are trading spot crypto directly.
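
On the order side, here is an equally rough sketch of how an agent's discrete action could be mapped to Spot testnet market orders with the same python-binance client.  The mapping mirrors the Long/Hold/Close comment in RL 1 (0 = long, 1 = hold, 2 = close), but the helper function, the symbol and the fixed quantity are illustrative assumptions on my part, and an actual short would need the futures testnet rather than spot:

from binance.client import Client
from binance.exceptions import BinanceAPIException

def execute_action(client: Client, action: int, symbol: str = "BTCUSDT",
                   quantity: float = 0.001):
    """Send a Spot-testnet market order for the agent's discrete action."""
    try:
        if action == 0:      # go long: market buy a fixed quantity
            return client.order_market_buy(symbol=symbol, quantity=quantity)
        elif action == 2:    # close the long: sell the same quantity back
            return client.order_market_sell(symbol=symbol, quantity=quantity)
        # action == 1 -> hold: no order is sent; a real short would need
        # the USD-M futures testnet instead of the spot testnet
        return None
    except BinanceAPIException as e:
        print("Order rejected:", e)
        return None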





RL 1:

import numpy as np
import pandas as pd

# create timesteps
time = np.arange(0, 50, 0.1)

# Assign amplitude and normalise above 0
amplitude = np.sin(time)
amplitude = amplitude + 1
max_amp = max(amplitude)
amplitude = amplitude / max_amp

# Construct Dataframe
df = pd.DataFrame(amplitude)
df.columns = ["Close"]
df["Close_Rt"] = df ["Close"].pct_change()
df = df.replace(np.inf, np.nan)
df = df.dropna()
df = df.reset_index(drop=True)

# Show dataframe and values
print(f"Length: {len(df)}")
print("Min Close: ", df["Close"].min())
print("Max Close: ", df["Close"].max())
df.head(2)

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15,3)
df["Close"].plot()

# ENVIRONMENT SETUP AND CLASS
import gym
from gym import spaces # Use Gym 0.24.0 from pip install
import numpy as np
import random
import torch

# Initialize Variables
MAX_INT = 2147483647
MAX_OPEN_POSITIONS = 1
INITIAL_ACCOUNT_BALANCE = 1000
PERCENT_CAPITAL = 0.1
TRADING_COSTS_RATE = 0.001
KILL_THRESH = 0.4 # Terminate if balance falls too low

# Build Environment Class
class StockTradingEnv(gym.Env):
    """ A stock trading environment with Open AI gym"""
    metadata = {'render.modes': ['human']}
    
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        
        # Generic variables
        self.df = df
        
        # Account variables
        self.account_balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.realized_profit = 0
        self.unrealized_profit = 0
        self.last_profit = 0
        
        # Position Variables
        self.open_quantities = []
        self.open_prices = []
        self.trading_costs = 0
        self.open_positions = 0
        self.closed_positions = 0
        self.incorrect_position_calls = 0
        self.num_trades = 0
        self.held_for_period = 0
        
        # Current Step
        self.current_step = 0
        self.max_steps = len(df)
        
        # Actions of the format Long, Hold and Close
        self.action_space = spaces.Discrete(3)
        
        # Setup observation Space
        self.observation_space = spaces.Box(low=-1, high=1, shape=(8,), dtype=np.float32)
        
    
    # Reward Structure
    def calculate_reward(self):
        reward = 0
        if self.num_trades:
            reward += self.realized_profit / self.num_trades
            reward += self.unrealized_profit / self.num_trades * 0.3
            reward += 1 if self.last_profit > 0 else 0
        reward -= 2 if self.incorrect_position_calls > 0 else 0
        if reward <= 0:
            reward -= 2
        return reward
    
    # Observation Structure
    def _next_observation(self):
        close_item = self.df.loc[self.current_step, "Close"].item()
        close_rt_item = self.df.loc[self.current_step, "Close_Rt"].item()
        close_T1_item = self.df.loc[self.current_step - 1, "Close_Rt"].item()
        close_T2_item = self.df.loc[self.current_step - 2, "Close_Rt"].item()
        close_T3_item = self.df.loc[self.current_step - 3, "Close_Rt"].item()
        close_T4_item = self.df.loc[self.current_step - 4, "Close_Rt"].item()
        
        current_position = 1 if self.open_positions else 0
        num_trades = self.num_trades / len(self.df) if self.num_trades > 0 else 0
        
        obs = np.array([close_item,close_rt_item,close_T1_item,close_T2_item,close_T3_item,close_T4_item,current_position,num_trades])
        
        return obs
        
    # Calculate the open positions value
    def _calculate_open_value(self):
        open_trades_value = 0
        counts = 0
        for qty in self.open_quantities:
            acquisition_price = self.open_prices[counts]
            open_trades_value += acquisition_price * qty
            counts += 1
        return open_trades_value
    
    # Calculate net profit
    def _profit_calculation(self, current_price, calc_type):
        open_trades_value = self._calculate_open_value()
        total_quantity_held = sum(self.open_quantities)
        current_value = total_quantity_held * current_price
        gross_profit = current_value - open_trades_value
        
        if calc_type == "close_position":
            trading_costs = current_value * TRADING_COSTS_RATE
            self.trading_costs += trading_costs
        elif calc_type == "hold_position" or calc_type == "open_position":
            trading_costs = open_trades_value * TRADING_COSTS_RATE
            
        net_profit = gross_profit - trading_costs
        
        return net_profit
    
    # Action Management
    def _take_action(self, action):
        current_price = self.df.loc[self.current_step, "Close"].item()
        
        # Reset last profit
        self.last_profit = 0
        self.incorrect_position_calls = 0
        
        # Go Long
        if action == 0:
            if self.open_positions < MAX_OPEN_POSITIONS:
                net_profit = self._profit_calculation(current_price, "open_position")
                net_worth = self.net_worth + net_profit
                trading_allowance = net_worth * PERCENT_CAPITAL
                
                self.open_quantities.append(trading_allowance / current_price)
                self.open_prices.append(current_price)
                self.trading_costs += trading_allowance * TRADING_COSTS_RATE
                self.num_trades += 1

Sorry, the second snippet of code did not post, so I'm posting it here:



RL 2:

 

# Data Preprocessing
import pandas as pd
from pandas_datareader.data import DataReader
from ta.volume import VolumeWeightedAveragePrice

# Environment
import gym
from gym import spaces
import numpy as np
import random
import torch

# PyTorch
import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

# Outputs
import matplotlib.pyplot as plt

# Data Extraction

from pandas_datareader import data as pdr

import yfinance as yf
yf.pdr_override() # <== that's all it takes


start_date = "2017-01-1"
end_date = "2022-06-01"
symbol = "AAPL"
df = pdr.get_data_yahoo("AAPL", start="2017-01-01", end="2022-06-01")
df.drop(columns=["Adj Close"], inplace=True)
df.head(2)

# Add VWAP to DataFrame
vwap = VolumeWeightedAveragePrice(high=df["High"], low=df["Low"], close=df["Close"], 
                                  volume=df["Volume"], window=14, fillna=False)
df["VWAP"] = vwap.volume_weighted_average_price()
df.dropna(inplace=True)
df.head(2)

# Convert to percentage returns and scale each column by its maximum
df_mod = df.copy()
df_mod = df_mod.pct_change() * 100
df_mod = df_mod / df_mod.max()
df_mod = df_mod.dropna()
df_mod = df_mod.reset_index(drop=True)
df_mod["Close_Price"] = df["Close"].iloc[1:].values
df_mod.head()

# Split Training and Testing
df_train = df_mod.copy()
df_train = df_train.iloc[:700]
df_test = df_mod.copy()
df_test = df_test.iloc[700:]

# View price behaviour
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15,5)
df_train["Close_Price"].plot()
df_test["Close_Price"].plot()

# Initialise variables
MAX_INT = 2147483647
MAX_TRADES = 10000
MAX_OPEN_POSITIONS = 1
INITIAL_ACCOUNT_BALANCE = 1000
PERCENT_CAPITAL = 0.1
TRADING_COSTS_RATE = 0.001
KILL_THRESH = 0.4 # Threshold for balance preservation


# Structure environment
class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        
        # Generic variables
        self.df = df
        
        # Account variables
        self.available_balance = INITIAL_ACCOUNT_BALANCE
        self.net_profit = 0
        
        # Position variables
        self.num_trades_long = 0
        self.num_trades_short = 0
        self.long_short_ratio = 0
        
        # Current Step
        self.current_step = 0
        self.lag = 20
        self.volatility = 1
        self.max_steps = len(df)

        # Actions of the format Long, Short
        self.action_space = spaces.Discrete(2)

        # Observation contains the scaled OHLCV values, VWAP and a trade-ratio flag
        self.observation_space = spaces.Box(low=-1, high=1, shape=(7, ), dtype=np.float32)

    # Calculate Reward
    def _calculate_reward(self):
        reward = 0
        reward += self.net_profit / self.volatility
        reward += 0.01 if self.long_short_ratio >= 0.3 and self.long_short_ratio <= 0.6 else -0.01
        return reward
        
    # Structure the observation data
    def _next_observation(self):
        
        item_0_T0 = self.df.loc[self.current_step, "Open"].item()
        item_1_T0 = self.df.loc[self.current_step, "High"].item()
        item_2_T0 = self.df.loc[self.current_step, "Low"].item()
        item_3_T0 = self.df.loc[self.current_step, "Close"].item()
        item_4_T0 = self.df.loc[self.current_step, "Volume"].item()
        item_5_T0 = self.df.loc[self.current_step, "VWAP"].item()
        
        env_4 = 1 if self.long_short_ratio else 0
        
        obs = np.array([item_0_T0, item_1_T0, item_2_T0, item_3_T0, item_4_T0, item_5_T0, env_4])
        
        return obs

    # Execute the trade implied by the chosen action at the current close price
    def _take_action(self, action):
        current_price = self.df.loc[self.current_step, "Close_Price"].item()
        next_price = self.df.loc[self.current_step + 1, "Close_Price"].item()
        next_return = next_price / current_price - 1
        
        # Go Long
        if action == 0:
            self.net_profit += self.available_balance * PERCENT_CAPITAL * next_return
            self.available_balance += self.net_profit
            self.num_trades_long += 1
                
        # Go Short
        if action == 1:
            self.net_profit += self.available_balance * PERCENT_CAPITAL * -next_return
            self.available_balance += self.net_profit
            self.num_trades_short += 1
        
        # Update metrics
        self.long_short_ratio = self.num_trades_long / (self.num_trades_long + self.num_trades_short)
        # Reward scale factor: the close price `lag` steps back (a rough proxy rather than a true volatility estimate)
        self.volatility = self.df.loc[self.current_step - self.lag, "Close_Price"].item()

    # Execute one time step within the environment
    def step(self, action):
        self._take_action(action)

        reward = self._calculate_reward()
    
        self.current_step += 1
        
        is_max_steps_taken = self.current_step >= self.max_steps - self.lag - 1
        done = is_max_steps_taken
        
        obs = self._next_observation()

        return obs, reward, done, {}

    # Reset the state of the environment to an initial state
    def reset(self):
        self.available_balance = INITIAL_ACCOUNT_BALANCE
        self.net_profit = 0
        self.current_step = self.lag
        self.num_trades_long = 0
        self.num_trades_short = 0
        self.long_short_ratio = 0

        return self._next_observation()

    # Render the environment to the screen
    def render(self, mode='human', close=False):
        pass

# Test Environment
env = StockTradingEnv(df_train)
actions = [0, 1, 0]  # Discrete(2): 0 = go long, 1 = go short
observation = env.reset()
print("Initial Observation: ", observation)

for action in actions:
    obs_, reward, done, info = env.step(action)
    print("")
    print("Action Taken: ", action)
    print("Reward Received : ", reward)
    print("Next State: ", obs_)
    print("Completed: ", done)
    print("-------------------")
    print("Available Balance: ", env.available_balance)
    print("Realized Profit: ", env.net_profit)
    print("Ratio: ", env.num_trades_ratio)
    print("-------------------")

class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards = []
        self.dones = []

        self.batch_size = batch_size

    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return np.array(self.states),\
                np.array(self.actions),\
                np.array(self.probs),\
                np.array(self.vals),\
                np.array(self.rewards),\
                np.array(self.dones),\
                batches

Hello Michael,



This sounds like an interesting project! It's great that you're trying to minimize feature engineering, as that can often be a time-consuming and difficult process.



I feel that Binance's API documentation is decent, so you should not face any issues when it comes to using your strategy for paper trading. You could also look at combining different agents and using them in the paper trading environment, but that will be further in the future.
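
Once an agent is trained, the paper-trading loop itself can stay quite small. The sketch below is only illustrative: it assumes a python-binance Spot testnet client and a hypothetical agent object exposing a choose_action(observation) method, and it builds a simplified observation from the latest closes rather than the exact feature vectors used in your environments above.

import time
import numpy as np
from binance.client import Client

def paper_trade(client: Client, agent, symbol: str = "BTCUSDT", poll_seconds: int = 60):
    while True:
        # Latest six 1-minute candles; field 4 of each kline is the close price
        klines = client.get_klines(symbol=symbol,
                                   interval=Client.KLINE_INTERVAL_1MINUTE,
                                   limit=6)
        closes = np.array([float(k[4]) for k in klines])
        returns = closes[1:] / closes[:-1] - 1
        # Simplified observation: scaled latest close plus the five recent returns
        obs = np.concatenate(([closes[-1] / closes.max()], returns))
        action = agent.choose_action(obs)
        print("Observation:", obs, "-> action:", action)
        # Order placement is left out here; a market-order helper against the
        # testnet (as sketched in your original post) would slot in at this point
        time.sleep(poll_seconds)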



All in all, I wish you the best and look forward to seeing your progress.