# NOTE: This model is for demonstration only.
# Make sure you are not dropping any input data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import matplotlib.dates as mdates
from datetime import datetime
# Date range for the download: fixed start, today's date as the end.
start_date = '2000-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')

# Download price history for the '^NSEI' (nifty) and 'GLD' (gold) tickers.
print(f"Downloading data from {start_date} to {end_date}")
nifty = yf.download('^NSEI', start=start_date, end=end_date)
gold = yf.download('GLD', start=start_date, end=end_date)

# Fail early with a clear message if a download came back with no rows,
# instead of hitting a less obvious error further down the script.
for _ticker, _frame in (('^NSEI', nifty), ('GLD', gold)):
    if _frame.empty:
        raise ValueError(f"No data downloaded for {_ticker}")

# If a downloaded frame has MultiIndex columns, drop level 1 so that a
# column such as 'Close' can be selected with a plain string key.
if isinstance(nifty.columns, pd.MultiIndex):
    nifty.columns = nifty.columns.droplevel(1)
if isinstance(gold.columns, pd.MultiIndex):
    gold.columns = gold.columns.droplevel(1)
def _monthly_close_mean(prices):
    """Return the mean of the 'Close' column for each 'M' resampling bin."""
    return prices['Close'].resample('M').mean()


# Monthly average closing prices for both instruments.
nifty_monthly_avg = _monthly_close_mean(nifty)
gold_monthly_avg = _monthly_close_mean(gold)

# Align both series on one index, keep only months present in both, then add
# the one-month lags (shift(1)) and the next-month gold target (shift(-1)).
data = pd.DataFrame({
    'nifty': nifty_monthly_avg,
    'gold': gold_monthly_avg,
}).dropna().assign(
    nifty_lag1=lambda frame: frame['nifty'].shift(1),
    gold_lag1=lambda frame: frame['gold'].shift(1),
    future_gold_price=lambda frame: frame['gold'].shift(-1),
)

# Feature-only view used for predictions. It does not contain the target
# column, so the final row (whose target is NaN after shift(-1)) survives
# the dropna below; only rows with a missing feature value are removed.
features_df = data.loc[:, ['nifty', 'nifty_lag1', 'gold_lag1', 'gold']].copy()
features_df_clean = features_df.dropna()

# Fully populated rows (features and target) used for training and testing.
data_clean = data.dropna()
# Target and feature matrix, both taken from the fully populated rows.
y = data_clean['future_gold_price']
X = data_clean[['nifty', 'nifty_lag1', 'gold_lag1', 'gold']]

# Positional 80/20 split without shuffling: the first 80% of the rows are
# used for training and the remaining rows for testing.
train_size = int(len(data_clean) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]
# Pipeline: standardize the features, then fit a linear regression.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
])

# The only setting searched over: whether the regression fits an intercept.
parameters = {'linear__fit_intercept': [True, False]}

# Time-series aware cross-validation splitter with 5 splits.
tscv = TimeSeriesSplit(n_splits=5)

# Grid search over the pipeline, scored by negative mean squared error.
model = GridSearchCV(
    pipeline,
    parameters,
    scoring='neg_mean_squared_error',
    cv=tscv,
)
model.fit(X_train, y_train)

# Report the selected parameters.
best_params = model.best_params_
print("Best parameters:", best_params)

# Use the best pipeline found by the search as the final model. With the
# default refit=True, GridSearchCV has already refitted it on all of
# X_train. Fitting a separate bare LinearRegression here would drop the
# scaler the parameters were selected with, so the reported test metric and
# any later predictions would come from two different models.
final_model = model.best_estimator_

# Predict on the held-out test rows.
y_pred = final_model.predict(X_test)

# Root mean squared error on the test set.
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
print(f"Test RMSE: {rmse:.2f}")
# Side-by-side view of test-set targets and predictions.
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison.tail())  # Show the last few predictions vs actual values

# Predict for every row that has complete features. This includes the most
# recent row, whose next-month actual value is not known yet.
X_all = features_df_clean[['nifty', 'nifty_lag1', 'gold_lag1', 'gold']]
predicted_values = final_model.predict(X_all)

# One row per feature date with its prediction.
prediction_df = pd.DataFrame({
    'date': features_df_clean.index,
    'predicted': predicted_values,
})

# Attach the known next-month prices. The left join keeps prediction rows
# that have no actual value yet (their 'actual' is NaN).
results_df = pd.merge(
    prediction_df,
    data_clean[['future_gold_price']],
    left_on='date',
    right_index=True,
    how='left',
)

# Rename columns for clarity (positional: date, predicted, future_gold_price).
results_df.columns = ['date', 'predicted', 'actual']

# Re-label each row with the month the prediction is for. The dates come
# from the month-end ('M') resample, so step to the next month-end with
# MonthEnd(1). DateOffset(months=1) keeps the day-of-month, which would turn
# e.g. Feb 28 into Mar 28 instead of Mar 31.
results_df['date'] = results_df['date'] + pd.offsets.MonthEnd(1)
# Restrict the chart to the 10 most recent rows of the results table.
last_10 = results_df.tail(10)
plt.figure(figsize=(12, 6))

# Draw one line per series: (column, legend label, color, marker).
for _column, _label, _color, _marker in (
    ('predicted', 'Predicted', 'blue', 'o'),
    ('actual', 'Actual', 'green', 'x'),
):
    plt.plot(
        last_10['date'],
        last_10[_column],
        label=_label,
        color=_color,
        marker=_marker,
    )

# Title, axis labels, light grid and legend.
plt.title('Predicted vs Actual Gold Prices (Last 10 Months)', fontsize=14)
plt.xlabel('Date')
plt.ylabel('Gold Price')
plt.grid(True, alpha=0.3)
plt.legend()

# Show one tick per plotted date, formatted as YYYY-MM-DD and rotated so
# the labels do not overlap.
_date_axis = plt.gca().xaxis
_date_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(last_10['date'], rotation=45)

plt.tight_layout()
plt.show()