Hi all,
I have run into the following issue while preparing the data to be fed into an LSTM-type network. Although timedelta and relativedelta are very cool, they take a bit of time to calculate; that, however, is not the main issue. I will walk you through it: first a very quick copy of the code, then I will point out where the issue is.
# Read in price data and use the parsed date as the index.
Prices = pd.read_csv("USDprices.csv")
Prices.head()
Prices['Date'].dtype
Prices['Date'] = pd.to_datetime(Prices['Date'], format='%Y-%m-%d')
Prices.head()
Prices.index = Prices['Date']
# print(Prices.head())

# Read in sentiment data and select the news-only marketRisk sentiment.
Sent = pd.read_csv("USDSentiment.csv")
# .ffill() replaces fillna(method="ffill"), which is deprecated in recent
# pandas releases; the forward-fill result is the same.
Sent = Sent[Sent.dataType == 'News'][['Date', 'marketRisk']].ffill()
# Sent.head()
Sent['Date'] = pd.to_datetime(Sent['Date'], format='%Y-%m-%d')
Sent.index = Sent['Date']
# print(Sent.head())

# Combine the price table and the sentiment table on their date indexes.
# Both frames still carry a 'Date' column, so the merge produces the
# suffixed 'Date_x' / 'Date_y' columns that are dropped below.
Ana = Prices.merge(Sent, left_index=True, right_index=True, how='inner')
Ana.head()
Ana.columns
Ana = Ana.drop(columns=['Unnamed: 0', 'Volume', 'Asset', 'Date_x', 'Date_y',
                        'Open', 'High', 'Low', 'UnadjClose'])
Ana.head()

# r: fractional change of the closing price over the previous time step.
prev_close = Ana['Close'].shift(1)
Ana['returns'] = (Ana['Close'] - prev_close) / prev_close
Ana.head()

# Target: next step's change in closing price, Close[t+1] - Close[t]
# (a price difference, not a price level).
Ana['Target'] = Ana["Close"].diff(1).shift(-1)
Ana.head()

# Change: next step's fractional return, i.e. returns[t+1].
Ana['Change'] = Ana["returns"].shift(-1)
Ana.head()

# So far so good; below I continue by adding a label via a loop.
## Label each time step False (no-buy) or True (buy) based on whether the
## price will rise at the next closing price.
timestep = 1

# Work on a plain NumPy array so that i + j is unambiguously a *position*.
# Ana is indexed by date, and integer keys on such a Series only worked
# through the positional fallback that pandas has deprecated.
returns = Ana["returns"].to_numpy()

# labels[i] is True when the summed returns of the next `timestep` steps
# (positions i+1 .. i+timestep) are strictly positive.  The last `timestep`
# rows have no complete look-ahead window and therefore get no label.
labels = [
    np.sum(returns[i + 1:i + timestep + 1]) > 0
    for i in range(len(returns) - timestep)
]
print(len(labels))

# Price table with labels as Signal.  Slicing by len(labels) drops the same
# trailing rows as iloc[:-timestep] but also behaves for timestep == 0,
# where [:-0] would yield an empty frame.
Ana = Ana.iloc[:len(labels), :].copy()
Ana.head()
Ana['Signal'] = labels
Ana.head()

# Then I will add some extra features.
# Feature engineering
Ana["Close30"] = Ana["Close"].rolling(30).mean()
Ana["Close100"] = Ana["Close"].rolling(100).mean()
Ana["r1"] = Ana["Close"].diff(1)
Ana["marketrisk_avg7"] = Ana["marketRisk"].rolling(7).mean()

# Drop rows containing NA (the warm-up rows of the rolling windows, among
# others).  The axis is passed by keyword: pandas 2.x no longer accepts it
# positionally.
Ana = Ana.dropna(axis=0)
Ana.head()


# Then here the following is defined as a function, to divide the original
# dataset into training, validating and testing examples.
# NOTE(review): as written, the function only builds the lagged feature
# frame; no train/validation/test split happens inside it.
def series_to_supervised(df, n_in=1, n_out=1):
    """Return *df* side by side with lagged copies of itself.

    For every lag ``i`` from ``n_in`` down to ``0`` the whole frame is
    shifted down by ``i`` rows and its columns are renamed to
    ``"<column>(t-<i>)"``; the shifted frames are concatenated column-wise.
    The first ``n_in`` rows of the result therefore contain NaN.

    Args:
        df: DataFrame holding one column per feature.
        n_in: Number of past steps to include in addition to the current
            step (lag 0).
        n_out: Unused; kept so existing calls keep working.

    Returns:
        DataFrame with ``(n_in + 1) * df.shape[1]`` columns, ordered from
        the oldest lag to the current step.
    """
    cols, names = [], []
    # Input sequence (t-n_in, ..., t-1, t-0).
    for i in range(n_in, -1, -1):
        cols.append(df.shift(i))
        names += [f"{col}(t-{i})" for col in df.columns]
    # Put it all together.
    # NOTE(review): bare `concat` requires `from pandas import concat`
    # somewhere in the file -- confirm it is imported.
    agg = concat(cols, axis=1)
    agg.columns = names
    return agg


# Features selection as below
Set = Ana[['r1', 'marketrisk_avg7', 'Close30', 'Close100']]
FEATURES_SHAPE = Set.shape[1]
print(FEATURES_SHAPE)

# Forming examples: each row holds SEQ_LEN consecutive steps of every feature.
SEQ_LEN = 5
Set = series_to_supervised(Set, SEQ_LEN - 1, 0)
# Index-aligned assignment of the targets onto the lagged feature frame.
Set[["Change", "Signal"]] = Ana[["Change", "Signal"]]
# Drops the first SEQ_LEN - 1 rows (incomplete windows) and any row whose
# Change is missing.
Set = Set.dropna()
print(Set.shape)
Set.head()
SetAdjusted = Set.copy()

# Then, due to space limitations, I will continue via email; I can't explain
# it all well here, so I will be sending you an email including a copy of
# the code above.
# Best regards