"""Train a random-forest direction classifier on daily equity data and backtest it.

Pipeline: download OHLCV data with yfinance, build per-ticker features,
label each row with the sign of the next day's return, fit a
RandomForestClassifier on data up to TRAIN_END, then backtest the
predictions after TRAIN_END as an equal-weighted long/flat portfolio.
"""

# ============================================
# 1. Imports
# ============================================
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# ============================================
# 2. Parameters
# ============================================
TICKERS = ["AAPL", "MSFT", "GOOGL", "AMZN", "META"]
START_DATE = "2015-01-01"
END_DATE = "2024-01-01"
TRAIN_END = "2020-12-31"
TRANSACTION_COST = 0.001  # 0.1% charged per unit of position change
TRADING_DAYS_PER_YEAR = 252  # annualisation factor for the Sharpe ratio


# ============================================
# 3. Download Data
# ============================================
def download_data(tickers, start=START_DATE, end=END_DATE):
    """Download daily price data and return it in long format.

    Args:
        tickers: Iterable of ticker symbols passed to ``yf.download``.
        start: First date to request (defaults to ``START_DATE``).
        end: End date to request (defaults to ``END_DATE``).

    Returns:
        A DataFrame with one row per (date, ticker): a ``date`` column, a
        ``ticker`` column, and the per-ticker columns returned by yfinance.

    Raises:
        ValueError: If yfinance returned no rows at all.
    """
    data = yf.download(tickers, start=start, end=end, group_by="ticker")
    if data.empty:
        raise ValueError(f"No data downloaded for tickers: {list(tickers)}")

    frames = []
    for ticker in tickers:
        # With group_by="ticker" the columns are keyed by ticker first.
        frame = data[ticker].copy()
        # Drop dates on which this ticker has no data at all.
        frame = frame.dropna(how="all")
        frame["ticker"] = ticker
        frames.append(frame)

    combined = pd.concat(frames)
    combined.index.name = "date"
    return combined.reset_index()


df = download_data(TICKERS)

# ============================================
# 4. Sort (IMPORTANT)
# ============================================
# Every rolling/shift operation below assumes rows are in date order
# within each ticker.
df = df.sort_values(["ticker", "date"]).reset_index(drop=True)

# ============================================
# 5. Feature Engineering (NO APPLY)
# ============================================
close_by_ticker = df.groupby("ticker")["Close"]

df["return_1d"] = close_by_ticker.pct_change()
df["return_5d"] = close_by_ticker.pct_change(5)
df["ma_5"] = close_by_ticker.transform(lambda x: x.rolling(5).mean())
df["ma_10"] = close_by_ticker.transform(lambda x: x.rolling(10).mean())
df["volatility_5d"] = (
    df.groupby("ticker")["return_1d"]
    .transform(lambda x: x.rolling(5).std())
)
df["volume_change"] = df.groupby("ticker")["Volume"].pct_change()
df["price_ma5_ratio"] = df["Close"] / df["ma_5"]

# ============================================
# 6. Labels (SAFE)
# ============================================
# Next-day return, shifted *within* each ticker so that the last row of one
# ticker can never pick up a value from the first row of the next ticker.
df["future_return"] = df.groupby("ticker")["return_1d"].shift(-1)

# ============================================
# 7. Clean Data
# ============================================
# pct_change yields +/-inf when the previous value is 0 (e.g. zero volume);
# StandardScaler rejects infinities, so treat them as missing.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna().reset_index(drop=True)

# Label only rows whose next-day return is known (NaNs were dropped above).
df["target"] = (df["future_return"] > 0).astype(int)

# ============================================
# 8. Train/Test Split
# ============================================
# Chronological split: everything up to TRAIN_END trains, the rest tests.
train = df[df["date"] <= TRAIN_END]
test = df[df["date"] > TRAIN_END].copy()

FEATURES = [
    "return_1d",
    "return_5d",
    "ma_5",
    "ma_10",
    "volatility_5d",
    "volume_change",
    "price_ma5_ratio",
]

X_train = train[FEATURES]
y_train = train["target"]
X_test = test[FEATURES]
y_test = test["target"]

# ============================================
# 9. Scaling
# ============================================
# Fit the scaler on the training set only, then apply it to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ============================================
# 10. Train Model
# ============================================
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
model.fit(X_train, y_train)

# ============================================
# 11. Predictions
# ============================================
preds = model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print(f"Test Accuracy: {accuracy:.4f}")

# ============================================
# 12. Backtest
# ============================================
test["prediction"] = preds

# One-day execution lag: the position held over a row's future_return is the
# signal produced on the previous row of the same ticker. The features on a
# row only use data up to that row's close, so this lag is a conservative
# execution assumption rather than a lookahead fix.
test["prediction"] = test.groupby("ticker")["prediction"].shift(1)
test = test.dropna(subset=["prediction"])

# Transaction costs are proportional to the change in position. The first
# traded row of each ticker has no previous position, so it is charged as a
# move from flat (0) to the current position.
position_change = test.groupby("ticker")["prediction"].diff().abs()
test["position_change"] = position_change.fillna(test["prediction"].abs())
test["transaction_cost"] = test["position_change"] * TRANSACTION_COST

# Strategy returns: long (1) or flat (0) over the next day, net of costs.
test["strategy_return"] = (
    test["future_return"] * test["prediction"] - test["transaction_cost"]
)

test = test.dropna()

# ============================================
# 13. Performance
# ============================================
# Aggregate to one return per date (equal weight across tickers) before
# compounding. Compounding the stacked per-ticker rows directly would chain
# the tickers end to end instead of producing a portfolio equity curve.
daily = (
    test.groupby("date")[["future_return", "strategy_return"]]
    .mean()
    .sort_index()
)
daily["cum_market"] = (1 + daily["future_return"]).cumprod()
daily["cum_strategy"] = (1 + daily["strategy_return"]).cumprod()

daily_std = daily["strategy_return"].std()
if daily_std > 0:
    sharpe = (
        np.sqrt(TRADING_DAYS_PER_YEAR)
        * daily["strategy_return"].mean()
        / daily_std
    )
else:
    # Undefined when the strategy never trades or has fewer than two days.
    sharpe = float("nan")
print(f"Sharpe Ratio: {sharpe:.2f}")

# ============================================
# 14. Plot
# ============================================
plt.figure(figsize=(10, 6))
plt.plot(daily.index, daily["cum_market"], label="Market")
plt.plot(daily.index, daily["cum_strategy"], label="Strategy")
plt.legend()
plt.title("Strategy vs Market")
plt.xlabel("Date")
plt.ylabel("Cumulative Return")
plt.grid()
plt.savefig("strategy.png", dpi=150)
print("Plot saved as strategy.png")