From 1c9500e29a63e596c6b8188350675eebae137e17 Mon Sep 17 00:00:00 2001 From: tcsenpai Date: Tue, 10 Sep 2024 19:26:35 +0200 Subject: [PATCH] - Improved LSTM and GRU architectures for better efficiency. - Implemented early stopping for neural networks to prevent overfitting. - Added more technical indicators for enhanced feature engineering. - Introduced a weighted ensemble method for combining model predictions. - Implemented a simple trading strategy based on predictions. - Enhanced visualization to include trading strategy performance. --- README.md | 26 ++++++++++---- goldigger.py | 96 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 92 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 7bbec90..fdc8c5e 100644 --- a/README.md +++ b/README.md @@ -11,26 +11,40 @@ goldigger is a sophisticated Python-based tool designed for stock price predicti ## Features + - **Data Fetching**: Automatically retrieves historical stock data from Yahoo Finance. - **Technical Indicators**: Calculates and incorporates various technical indicators for enhanced analysis. - **Multiple ML Models**: Utilizes LSTM, GRU, Random Forest, and XGBoost models for prediction. -- **Ensemble Prediction**: Combines predictions from multiple models for improved accuracy. +- **Ensemble Prediction**: Combines predictions from multiple models using a weighted approach for improved accuracy. - **Hyperparameter Tuning**: Implements randomized search for optimizing Random Forest and XGBoost models. - **Time Series Cross-Validation**: Ensures robust model evaluation respecting temporal order of data. - **Risk Metrics**: Calculates Sharpe ratio and maximum drawdown for risk assessment. - **Future Price Prediction**: Forecasts stock prices for a specified number of future days. -- **Visualization**: Generates plots showing actual prices, predictions, and future forecasts. +- **Visualization**: Generates plots showing actual prices, predictions, future forecasts, and trading strategy performance. - **Performance Summary**: Provides a detailed table of model performance metrics. +- **Trading Strategy**: Implements a simple trading strategy based on predictions and calculates its performance. + ## Key Components 1. **Data Preparation**: Fetches stock data, adds technical indicators, and prepares sequences for model input. -2. **Model Creation**: Implements LSTM, GRU, Random Forest, and XGBoost models. -3. **Model Training and Evaluation**: Uses time series cross-validation for robust performance assessment. +2. **Model Creation**: Implements LSTM, GRU, Random Forest, and XGBoost models with optimized architectures. +3. **Model Training and Evaluation**: Uses time series cross-validation for robust performance assessment, including one-step-ahead predictions for LSTM and GRU models. 4. **Hyperparameter Tuning**: Optimizes Random Forest and XGBoost models using randomized search. -5. **Ensemble Prediction**: Combines predictions from all models for final forecast. +5. **Ensemble Prediction**: Combines predictions from all models using a weighted approach for final forecast. 6. **Risk Analysis**: Calculates key risk metrics for informed decision-making. -7. **Visualization**: Plots results and generates a performance summary table. +7. **Trading Strategy**: Implements a simple trading strategy based on model predictions. +8. **Visualization**: Plots results, generates a performance summary table, and displays trading strategy performance. + +## Recent Enhancements + +- Improved LSTM and GRU architectures for better efficiency. +- Implemented early stopping for neural networks to prevent overfitting. +- Added more technical indicators for enhanced feature engineering. +- Introduced a weighted ensemble method for combining model predictions. +- Implemented a simple trading strategy based on predictions. +- Enhanced visualization to include trading strategy performance. + ## Customization diff --git a/goldigger.py b/goldigger.py index 6d036c0..8fcc336 100644 --- a/goldigger.py +++ b/goldigger.py @@ -6,7 +6,7 @@ from sklearn.preprocessing import MinMaxScaler from sklearn.metrics import mean_squared_error, r2_score from tensorflow.keras.models import Sequential, clone_model as keras_clone_model from tensorflow.keras.layers import LSTM, Dense, GRU -from tensorflow.keras.callbacks import Callback +from tensorflow.keras.callbacks import Callback, EarlyStopping from datetime import datetime, timedelta from tqdm.auto import tqdm import yfinance as yf @@ -64,6 +64,11 @@ def add_technical_indicators(data): data['RSI'] = ta.momentum.rsi(data['Close'], window=14) data['MACD'] = ta.trend.macd_diff(data['Close']) data['BB_upper'], data['BB_middle'], data['BB_lower'] = ta.volatility.bollinger_hband_indicator(data['Close']), ta.volatility.bollinger_mavg(data['Close']), ta.volatility.bollinger_lband_indicator(data['Close']) + # Advanced indicators + data['EMA_20'] = ta.trend.ema_indicator(data['Close'], window=20) + data['ATR'] = ta.volatility.average_true_range(data['High'], data['Low'], data['Close']) + data['ADX'] = ta.trend.adx(data['High'], data['Low'], data['Close']) + data['Stoch_K'] = ta.momentum.stoch(data['High'], data['Low'], data['Close']) return data # Prepare data for model training by scaling and creating sequences @@ -84,8 +89,9 @@ def prepare_data(data, look_back=60): # Create an LSTM model for time series prediction def create_lstm_model(input_shape): model = Sequential([ - LSTM(units=50, return_sequences=True, input_shape=input_shape), - LSTM(units=50), + LSTM(units=64, return_sequences=True, input_shape=input_shape), + LSTM(units=32), + Dense(units=16, activation='relu'), Dense(units=1) ]) model.compile(optimizer='adam', loss='mean_squared_error') @@ -94,30 +100,26 @@ def create_lstm_model(input_shape): # Create a GRU model for time series prediction def create_gru_model(input_shape): model = Sequential([ - GRU(units=50, return_sequences=True, input_shape=input_shape), - GRU(units=50), + GRU(units=64, return_sequences=True, input_shape=input_shape), + GRU(units=32), + Dense(units=16, activation='relu'), Dense(units=1) ]) model.compile(optimizer='adam', loss='mean_squared_error') - return model + return model # Train and evaluate a model using time series cross-validation def train_and_evaluate_model(model, X, y, n_splits=5, model_name="Model"): - # Initialize time series cross-validation tscv = TimeSeriesSplit(n_splits=n_splits) scores = [] oof_predictions = np.zeros_like(y) - # Iterate through each fold with tqdm(total=n_splits, desc=f"Training {model_name}", leave=False) as pbar: for fold, (train_index, val_index) in enumerate(tscv.split(X), 1): - # Split data into training and validation sets X_train, X_val = X[train_index], X[val_index] y_train, y_val = y[train_index], y[val_index] - # Handle different model types (sklearn models vs Keras models) if isinstance(model, (RandomForestRegressor, XGBRegressor)): - # For sklearn models X_train_2d = X_train.reshape(X_train.shape[0], -1) X_val_2d = X_val.reshape(X_val.shape[0], -1) cloned_model = sklearn.base.clone(model) @@ -125,23 +127,25 @@ def train_and_evaluate_model(model, X, y, n_splits=5, model_name="Model"): val_pred = cloned_model.predict(X_val_2d) oof_predictions[val_index] = val_pred elif isinstance(model, Sequential): - # For Keras models (LSTM and GRU) cloned_model = keras_clone_model(model) cloned_model.compile(optimizer='adam', loss='mean_squared_error') + early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) cloned_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=0, - callbacks=[TqdmProgressCallback(100, f"{model_name} Epoch {fold}/{n_splits}")]) - val_pred = cloned_model.predict(X_val) - oof_predictions[val_index] = val_pred.flatten() + callbacks=[TqdmProgressCallback(100, f"{model_name} Epoch {fold}/{n_splits}"), early_stopping]) + + # Predict one step ahead for each sample in the validation set + val_pred = [] + for i in range(len(X_val)): + pred = cloned_model.predict(X_val[i:i+1]) + val_pred.append(pred[0][0]) + oof_predictions[val_index] = val_pred else: - # Raise error for unsupported model types raise ValueError(f"Unsupported model type: {type(model)}") - # Calculate and store the score for this fold score = r2_score(y_val, val_pred) scores.append(score) pbar.update(1) - # Calculate overall score and return results overall_score = r2_score(y, oof_predictions) return np.mean(scores), np.std(scores), overall_score, oof_predictions @@ -156,6 +160,16 @@ def ensemble_predict(models, X): predictions.append(pred.flatten()) # Flatten the predictions return np.mean(predictions, axis=0) +def weighted_ensemble_predict(models, X, weights): + predictions = [] + for model, weight in zip(models, weights): + if isinstance(model, (RandomForestRegressor, XGBRegressor)): + pred = model.predict(X.reshape(X.shape[0], -1)) + else: + pred = model.predict(X) + predictions.append(weight * pred.flatten()) + return np.sum(predictions, axis=0) + # Calculate risk metrics (Sharpe ratio and max drawdown) def calculate_risk_metrics(returns): sharpe_ratio = np.mean(returns) / np.std(returns) * np.sqrt(252) # Assuming daily returns @@ -251,6 +265,19 @@ def tune_xgboost(X, y, quick_test=False): print(f"Best XGBoost parameters: {xgb_random.best_params_}") return xgb_random.best_estimator_ +def implement_trading_strategy(actual_prices, predicted_prices, threshold=0.01): + returns = [] + position = 0 # -1: short, 0: neutral, 1: long + for i in range(1, len(actual_prices)): + predicted_return = (predicted_prices[i] - actual_prices[i-1]) / actual_prices[i-1] + if predicted_return > threshold and position <= 0: + position = 1 # Buy + elif predicted_return < -threshold and position >= 0: + position = -1 # Sell + actual_return = (actual_prices[i] - actual_prices[i-1]) / actual_prices[i-1] + returns.append(position * actual_return) + return np.array(returns) + # Main function to analyze stock data and make predictions def analyze_and_predict_stock(symbol, start_date, end_date, future_days=30, suppress_warnings=False, quick_test=False): # Suppress warnings if flag is set @@ -331,7 +358,8 @@ def analyze_and_predict_stock(symbol, start_date, end_date, future_days=30, supp print(tabulate(stats_df, headers='keys', tablefmt='pretty', floatfmt='.4f')) # Use out-of-fold predictions for ensemble - ensemble_predictions = np.mean([oof_predictions[name] for name in results.keys()], axis=0) + model_weights = [0.3, 0.3, 0.2, 0.2] # Adjust these based on model performance + ensemble_predictions = weighted_ensemble_predict([model for _, model in models], X, model_weights) # Predict future data future_predictions = [] @@ -354,11 +382,16 @@ def analyze_and_predict_stock(symbol, start_date, end_date, future_days=30, supp print(f"Sharpe Ratio: {sharpe_ratio:.4f}") print(f"Max Drawdown: {max_drawdown:.4f}") + # Implement trading strategy + strategy_returns = implement_trading_strategy(data['Close'].values[-len(ensemble_predictions):], ensemble_predictions.flatten()) + strategy_sharpe_ratio = np.mean(strategy_returns) / np.std(strategy_returns) * np.sqrt(252) + print(f"Trading Strategy Sharpe Ratio: {strategy_sharpe_ratio:.4f}") + # Plot results - plt.figure(figsize=(20, 16)) # Increased figure height + plt.figure(figsize=(20, 24)) # Increased figure height # Price prediction plot - plt.subplot(2, 1, 1) + plt.subplot(3, 1, 1) plot_data = data.iloc[-len(ensemble_predictions):] future_dates = pd.date_range(start=plot_data.index[-1] + pd.Timedelta(days=1), periods=future_days) @@ -372,7 +405,7 @@ def analyze_and_predict_stock(symbol, start_date, end_date, future_days=30, supp plt.legend() # Model performance summary table - plt.subplot(2, 1, 2) + plt.subplot(3, 1, 2) plt.axis('off') table = plt.table(cellText=stats_df.values, colLabels=stats_df.columns, @@ -385,9 +418,24 @@ def analyze_and_predict_stock(symbol, start_date, end_date, future_days=30, supp # Lower the title and add more space between plot and table plt.title('Model Performance Summary', pad=60) + # Calculate cumulative returns of the trading strategy + cumulative_returns = (1 + strategy_returns).cumprod() - 1 + + # Add new subplot for trading strategy performance + plt.subplot(3, 1, 3) + plt.plot(plot_data.index[-len(cumulative_returns):], cumulative_returns, label='Strategy Cumulative Returns', color='purple') + plt.title(f'{symbol} Trading Strategy Performance') + plt.xlabel('Date') + plt.ylabel('Cumulative Returns') + plt.legend() + + # Add strategy Sharpe ratio as text on the plot + plt.text(0.05, 0.95, f'Strategy Sharpe Ratio: {strategy_sharpe_ratio:.4f}', + transform=plt.gca().transAxes, verticalalignment='top') + plt.tight_layout() - plt.savefig(f'{symbol}_prediction_with_stats.png', dpi=300, bbox_inches='tight') - print(f"Plot with statistics saved as '{symbol}_prediction_with_stats.png'") + plt.savefig(f'{symbol}_prediction_with_stats_and_strategy.png', dpi=300, bbox_inches='tight') + print(f"Plot with statistics and strategy performance saved as '{symbol}_prediction_with_stats_and_strategy.png'") plt.show() print(f"\nFuture predictions for the next {future_days} days:")