Files
random-forest-trader/main.py
2026-03-22 23:47:10 -04:00

170 lines
4.7 KiB
Python

# ============================================
# 1. Imports
# ============================================
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# ============================================
# 2. Parameters
# ============================================
# Symbols passed to download_data(); their rows are stacked into one frame.
TICKERS = ["AAPL", "MSFT", "GOOGL", "AMZN", "META"]
# Date range handed to yf.download() as its start/end arguments.
START_DATE = "2015-01-01"
END_DATE = "2024-01-01"
# Rows dated on or before this day form the training set; later rows are the test set.
TRAIN_END = "2020-12-31"
# Cost charged per unit of absolute position change in the backtest (0.001 = 0.1%).
TRANSACTION_COST = 0.001 # 0.1%
# ============================================
# 3. Download Data
# ============================================
def download_data(tickers, start=None, end=None):
    """Download price history for several tickers as one long-format frame.

    Args:
        tickers: Iterable of ticker symbols to request from yfinance.
        start: First date to request. Defaults to the module-level START_DATE.
        end: End date to request. Defaults to the module-level END_DATE.

    Returns:
        A DataFrame with one row per (ticker, date): the columns yfinance
        returned for each ticker, plus a "ticker" column and a "date" column
        (the former index).

    Raises:
        ValueError: If ``tickers`` is empty.
    """
    # Materialise once so generators survive both the download and the loop.
    tickers = list(tickers)
    if not tickers:
        # Fail before hitting the network; pd.concat([]) would raise a less
        # helpful ValueError later anyway.
        raise ValueError("tickers must contain at least one symbol")

    start = START_DATE if start is None else start
    end = END_DATE if end is None else end

    # group_by="ticker" makes the ticker symbol the outer column level, so
    # data[ticker] selects that ticker's price columns.
    data = yf.download(tickers, start=start, end=end, group_by="ticker")

    frames = []
    for ticker in tickers:
        frame = data[ticker].copy()
        frame["ticker"] = ticker
        frames.append(frame)

    combined = pd.concat(frames)
    combined.index.name = "date"
    return combined.reset_index()
df = download_data(TICKERS)
# ============================================
# 4. Sort (IMPORTANT)
# ============================================
# Rolling windows and lags below assume each ticker's rows are in date order.
df = df.sort_values(["ticker", "date"])
# ============================================
# 5. Feature Engineering (NO APPLY)
# ============================================
# Every feature is computed inside its own ticker group so that no window or
# lag ever mixes prices from two different symbols.
df["return_1d"] = df.groupby("ticker")["Close"].pct_change()
df["return_5d"] = df.groupby("ticker")["Close"].pct_change(5)
df["ma_5"] = df.groupby("ticker")["Close"].transform(lambda x: x.rolling(5).mean())
df["ma_10"] = df.groupby("ticker")["Close"].transform(lambda x: x.rolling(10).mean())
df["volatility_5d"] = (
    df.groupby("ticker")["return_1d"]
    .transform(lambda x: x.rolling(5).std())
)
df["volume_change"] = df.groupby("ticker")["Volume"].pct_change()
df["price_ma5_ratio"] = df["Close"] / df["ma_5"]
# ============================================
# 6. Labels (SAFE)
# ============================================
# Next-bar return, computed per ticker. Shifting inside the group guarantees
# the last row of one ticker can never pick up a value from the next ticker;
# it gets NaN instead and is removed by the cleaning step.
next_close = df.groupby("ticker")["Close"].shift(-1)
df["future_return"] = next_close / df["Close"] - 1
# 1 when the next bar closes higher, else 0. Rows whose future_return is NaN
# evaluate to 0 here but are dropped below, so they never reach the model.
df["target"] = (df["future_return"] > 0).astype(int)
# ============================================
# 7. Clean Data
# ============================================
# A zero in the previous Volume (or Close) turns pct_change into +/-inf, which
# dropna() would keep and the scaler/model would later reject. Treat those as
# missing so they are removed together with the warm-up rows of the rolling
# windows and the unlabeled last row of each ticker.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna().reset_index(drop=True)
# ============================================
# 8. Train/Test Split
# ============================================
# Chronological split: rows dated up to and including TRAIN_END train the
# model, strictly later rows are held out for testing.
in_train = df["date"] <= TRAIN_END
in_test = df["date"] > TRAIN_END
train = df.loc[in_train]
test = df.loc[in_test]
# Columns fed to the model, in the order the scaler and forest will see them.
FEATURES = [
    "return_1d",
    "return_5d",
    "ma_5",
    "ma_10",
    "volatility_5d",
    "volume_change",
    "price_ma5_ratio",
]
X_train, y_train = train[FEATURES], train["target"]
X_test, y_test = test[FEATURES], test["target"]
# ============================================
# 9. Scaling
# ============================================
# Fit the scaler on the training rows only, then apply the same statistics to
# both sets so nothing from the test period influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# ============================================
# 10. Train Model
# ============================================
# Shallow forest with a fixed seed so repeated runs give the same model.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
# ============================================
# 11. Predictions
# ============================================
# One 0/1 prediction per test row, in the same row order as `test`.
preds = model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print(f"Test Accuracy: {accuracy:.4f}")
# ============================================
# 12. Backtest
# ============================================
# Work on a copy so the column assignments below never write into a slice of df.
test = test.copy()
test["prediction"] = preds
# 🚨 Avoid lookahead bias
# Trade each row on the previous row's signal for the same ticker. This adds a
# one-bar execution lag: the position held over a row's future_return never
# uses a prediction made from that same row's features.
test["prediction"] = test.groupby("ticker")["prediction"].shift(1)
# The first row of every ticker has no lagged signal and cannot be traded.
test = test.dropna(subset=["prediction"])
# Strategy returns: long (1) or flat (0) over the next bar.
test["strategy_return"] = test["future_return"] * test["prediction"]
# Transaction costs: charged on every change of position. On the first
# tradable row of a ticker there is no previous position, so diff() is NaN;
# treat that as entering from flat so the opening trade is charged too instead
# of the whole row being discarded.
position_change = test.groupby("ticker")["prediction"].diff().abs()
test["position_change"] = position_change.fillna(test["prediction"].abs())
test["transaction_cost"] = test["position_change"] * TRANSACTION_COST
test["strategy_return"] -= test["transaction_cost"]
# Drop any remaining rows with missing values
test = test.dropna()
# ============================================
# 13. Performance
# ============================================
# `test` is ordered by ticker and then date, so compounding its rows directly
# would chain the tickers one after another. Average across tickers per date
# first (an equal-weight portfolio) and compound that single daily series.
daily = (
    test.groupby("date")[["future_return", "strategy_return"]]
    .mean()
    .sort_index()
)
daily["cum_market"] = (1 + daily["future_return"]).cumprod()
daily["cum_strategy"] = (1 + daily["strategy_return"]).cumprod()
# Keep the per-row columns available: each row carries the portfolio's
# cumulative value on its date.
test["cum_market"] = test["date"].map(daily["cum_market"])
test["cum_strategy"] = test["date"].map(daily["cum_strategy"])
# Annualised Sharpe ratio of the daily portfolio returns (sqrt(252) assumes
# one row per trading day). Undefined when there is no variation or no data.
daily_vol = daily["strategy_return"].std()
if daily_vol > 0:
    sharpe = np.sqrt(252) * daily["strategy_return"].mean() / daily_vol
else:
    sharpe = float("nan")
print(f"Sharpe Ratio: {sharpe:.2f}")
# ============================================
# 14. Plot
# ============================================
plt.figure(figsize=(10, 6))
plt.plot(daily.index, daily["cum_market"], label="Market")
plt.plot(daily.index, daily["cum_strategy"], label="Strategy")
plt.legend()
plt.title("Strategy vs Market")
plt.xlabel("Date")
plt.ylabel("Cumulative Return")
plt.grid()
# The figure is written to disk rather than shown, so the script also runs
# without a display.
plt.savefig("strategy.png", dpi=150)
print("Plot saved as strategy.png")