Factor-Based Portfolio Strategy Using Fama–French Five Factors¶

Group Members: Rayhan Basheer Patel (UID: 122087934), Govind Singhal (UID: 117780413), Chaithanya Sai Musalreddy (UID: 122257672)

Introduction¶

This project investigates whether factor-based investment strategies can outperform a passive market benchmark when evaluated using a realistic, walk-forward framework. Specifically, three approaches are compared:

  • SPY (Benchmark): A passive buy-and-hold strategy representing market performance.
  • Linear Fama–French Five-Factor Model: A theory-driven, interpretable regression model with time-varying factor loadings.
  • XGBoost Regression Model: A nonlinear machine learning model designed to capture complex relationships between factors and returns.

While factor models are widely used in academic finance, their real-world performance depends on market regime, model assumptions, and evaluation methodology. The study evaluates whether increasing model complexity leads to improved risk-adjusted performance, rather than simply higher raw returns.
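Risk-adjusted performance is typically summarized with a metric such as the annualized Sharpe ratio. As a minimal sketch of that idea (the return series below is illustrative, not the project's data):

```python
import numpy as np
import pandas as pd

def annualized_sharpe(monthly_returns: pd.Series, monthly_rf: pd.Series) -> float:
    """Annualized Sharpe ratio from monthly simple returns."""
    excess = monthly_returns - monthly_rf   # return earned above the risk-free rate
    return float(np.sqrt(12) * excess.mean() / excess.std())

# Illustrative data only
rets = pd.Series([0.02, -0.01, 0.03, 0.015, -0.005, 0.01])
rf = pd.Series([0.001] * 6)
print(f"Sharpe: {annualized_sharpe(rets, rf):.3f}")
```

A strategy with higher raw returns but much higher volatility can still score worse on this metric, which is why the comparison below is framed in risk-adjusted terms.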

Fama-French 5 Factors Overview¶

Mkt-RF (Market Risk Premium): Market return minus risk-free rate. Measures if stocks beat safe investments.

SMB (Small Minus Big): Small-cap vs large-cap performance. Captures size effect.

HML (High Minus Low): Value stocks vs growth stocks. The "value premium."

RMW (Robust Minus Weak): Profitable vs unprofitable companies. Quality factor.

CMA (Conservative Minus Aggressive): Firms that invest conservatively vs firms that invest aggressively. Investment discipline factor.

RF (Risk-Free Rate): Return on safe investments (T-bills). Baseline return.

Interpreting Positive vs Negative Values¶
Factor   Positive Means            Negative Means
Mkt-RF   Stocks beat T-bills       T-bills beat stocks
SMB      Small caps win            Large caps win
HML      Value stocks win          Growth stocks win
RMW      Profitable firms win      Unprofitable firms win
CMA      Conservative firms win    Aggressive firms win
RF       Higher safe returns       Lower safe returns
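The sign conventions in the table can be encoded directly as a lookup; a toy sketch with made-up monthly factor values (in percent):

```python
# Sign conventions from the interpretation table above
interpretations = {
    "Mkt-RF": ("stocks beat T-bills",    "T-bills beat stocks"),
    "SMB":    ("small caps win",         "large caps win"),
    "HML":    ("value stocks win",       "growth stocks win"),
    "RMW":    ("profitable firms win",   "unprofitable firms win"),
    "CMA":    ("conservative firms win", "aggressive firms win"),
}

def read_month(factor_values):
    """Map each factor's sign to its verbal interpretation."""
    return {f: interpretations[f][0 if v >= 0 else 1]
            for f, v in factor_values.items()}

# Made-up monthly factor values (percent), for illustration only
sample = {"Mkt-RF": 1.8, "SMB": -0.3, "HML": 0.5, "RMW": -0.1, "CMA": 0.2}
for factor, verdict in read_month(sample).items():
    print(f"{factor}: {verdict}")
```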
In [1]:
# Importing necessary libraries

import os
import re
import zipfile
from io import BytesIO, StringIO
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import yfinance as yf
from scipy.stats import pearsonr
from IPython.display import display

plt.style.use("seaborn-v0_8")

USE_CACHED_DATA = True
ALLOW_DOWNLOADS = not USE_CACHED_DATA

os.makedirs("data/raw", exist_ok=True)
os.makedirs("docs/assets", exist_ok=True)

Data Curation¶

Data Sources¶

Two primary datasets are used:

  • Fama–French Five Factors + Risk-Free Rate

    • Source: Ken French Data Library
    • Factors include Market Excess Return (Mkt-RF), Size (SMB), Value (HML), Profitability (RMW), Investment (CMA), and Risk-Free Rate (RF).
  • S&P 500 Market Data

    • Source: Wikipedia and Yahoo Finance via the yfinance Python package
    • Monthly adjusted closing prices are used to compute index-level returns.

These datasets are publicly available, well-documented, and widely accepted in financial research.

Data Preparation¶

All datasets are converted to a monthly frequency to ensure temporal consistency. Date fields are parsed into datetime objects and set as indices. Market prices are transformed into monthly percentage returns, and missing observations are removed after alignment.
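As a minimal sketch of that preparation step, assuming a hypothetical single-ticker daily price series (the notebook itself pulls real prices via yfinance):

```python
import pandas as pd

# Hypothetical daily prices; the real notebook downloads prices via yfinance
prices = pd.DataFrame(
    {"TICK": [100.0, 102.0, 101.0, 105.0, 107.0]},
    index=pd.to_datetime(["2020-01-10", "2020-01-31",
                          "2020-02-14", "2020-02-28", "2020-03-31"]),
)

# Last observed price in each calendar month
monthly_prices = prices.groupby(prices.index.to_period("M")).last()

# Month-over-month percentage returns; drop the first (undefined) month
monthly_rets = monthly_prices.pct_change().dropna()

# Month-start timestamps, matching the index convention used throughout the notebook
monthly_rets.index = monthly_rets.index.to_timestamp()
print(monthly_rets)
```

The same month-start convention is what allows the market returns, the SPY benchmark, and the compounded FF5 factors to be joined on a common index later.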

In [2]:
ff5_path = "data/raw/ff5_data.csv"
local_raw_path = "F-F_Research_Data_5_Factors_2x3_daily.csv"


def _parse_ff5_daily_lines(lines):
    rows = []
    for line in lines:
        compact = "".join(line.replace("\ufeff", "").split())
        if re.match(r"^\d{8},", compact):
            parts = compact.split(",")
            if len(parts) >= 7:
                rows.append(parts[:7])

    if len(rows) == 0:
        raise RuntimeError("Could not parse any FF5 daily rows from the raw Ken French CSV.")

    df = pd.DataFrame(rows, columns=["Date", "Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"])
    for c in ["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(subset=["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"])
    return df


if not os.path.exists(ff5_path):
    if not ALLOW_DOWNLOADS:
        raise RuntimeError(
            "FF5 data is missing and downloads are disabled. "
            "Either set USE_CACHED_DATA = False or provide data/raw/ff5_data.csv"
        )

    if os.path.exists(local_raw_path):
        with open(local_raw_path, "r", encoding="latin-1") as f:
            raw_lines = f.read().splitlines()
        ff5_df = _parse_ff5_daily_lines(raw_lines)
    else:
        ff5_url = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_5_Factors_2x3_daily_CSV.zip"
        resp = requests.get(ff5_url, timeout=60, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()

        zf = zipfile.ZipFile(BytesIO(resp.content))
        csv_names = [n for n in zf.namelist() if n.lower().endswith(".csv")]
        if len(csv_names) == 0:
            raise RuntimeError("Ken French ZIP did not contain a CSV file.")

        daily_candidates = [n for n in csv_names if "daily" in n.lower()]
        csv_name = daily_candidates[0] if len(daily_candidates) > 0 else csv_names[0]

        raw_text = zf.read(csv_name).decode("latin-1")
        ff5_df = _parse_ff5_daily_lines(raw_text.splitlines())

    ff5_df.to_csv(ff5_path, index=False)

print("FF5 data ready:", ff5_path)
FF5 data ready: data/raw/ff5_data.csv
In [3]:
ff5_data = pd.read_csv("data/raw/ff5_data.csv")

# Convert dates, coercing errors to NaT (Not a Time)
ff5_data['Date'] = pd.to_datetime(ff5_data['Date'], format='%Y%m%d', errors='coerce')

# Remove rows where Date conversion failed
ff5_data = ff5_data.dropna(subset=['Date'])
In [4]:
# Display summary statistics for FF5 factors
print("Summary Statistics for Fama-French Five Factors:")
display(ff5_data[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']].describe())
Summary Statistics for Fama-French Five Factors:
              Mkt-RF           SMB           HML           RMW           CMA            RF
count   15690.000000  15690.000000  15690.000000  15690.000000  15690.000000  15690.000000
mean        0.028841      0.005815      0.014004      0.012736      0.011643      0.017183
std         1.023264      0.551013      0.585571      0.403702      0.382175      0.012740
min       -17.440000    -11.150000     -5.030000     -2.970000     -5.310000      0.000000
25%        -0.420000     -0.280000     -0.240000     -0.180000     -0.180000      0.010000
50%         0.050000      0.020000      0.010000      0.010000      0.010000      0.020000
75%         0.510000      0.310000      0.260000      0.190000      0.200000      0.020000
max        11.360000      6.080000      6.730000      4.570000      2.480000      0.060000
In [5]:
# Set Date as index for better plotting
ff5_data.set_index('Date', inplace=True)

# Plot each feature in subplots
features = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
colors = ['blue', 'green', 'red', 'purple', 'orange', 'brown']

fig, axes = plt.subplots(6, 1, figsize=(12, 15))
fig.suptitle('Fama-French Factors Over Time', fontsize=16)

for i, (feature, color) in enumerate(zip(features, colors)):
    axes[i].plot(ff5_data.index, ff5_data[feature], color=color, linewidth=0.8)
    axes[i].set_title(f'{feature}', fontsize=12)
    axes[i].set_ylabel('Value')
    axes[i].grid(True, alpha=0.3)
    axes[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()
[Figure: "Fama-French Factors Over Time" — six stacked time-series panels, one per factor]

Exploratory Data Analysis¶

Exploratory analysis is conducted to understand the statistical properties of the factor data and to motivate model selection.

In [6]:
sp500_csv_path = "data/raw/sp500.csv"

if os.path.exists(sp500_csv_path):
    sp500_df = pd.read_csv(sp500_csv_path)
    if "Symbol" not in sp500_df.columns:
        raise RuntimeError("data/raw/sp500.csv is missing a 'Symbol' column.")
    sp500_df["Symbol"] = sp500_df["Symbol"].astype(str).str.replace(".", "-", regex=False)
    sp500_tickers = sp500_df["Symbol"].tolist()
else:
    if not ALLOW_DOWNLOADS:
        raise RuntimeError(
            "S&P 500 constituents file is missing and downloads are disabled. "
            "Either set USE_CACHED_DATA = False or provide data/raw/sp500.csv"
        )

    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers, timeout=60)
    response.raise_for_status()

    sp500_table = pd.read_html(StringIO(response.text))
    sp500_df = sp500_table[0]

    sp500_df["Symbol"] = sp500_df["Symbol"].astype(str).str.replace(".", "-", regex=False)
    sp500_df.to_csv(sp500_csv_path, index=False)

    sp500_tickers = sp500_df["Symbol"].tolist()

print("S&P 500 Companies:")
print(sp500_df.head())
print(f"\nTotal companies: {len(sp500_df)}")
S&P 500 Companies:
  Symbol             Security             GICS Sector  \
0    MMM                   3M             Industrials   
1    AOS          A. O. Smith             Industrials   
2    ABT  Abbott Laboratories             Health Care   
3   ABBV               AbbVie             Health Care   
4    ACN            Accenture  Information Technology   

                GICS Sub-Industry    Headquarters Location  Date added  \
0        Industrial Conglomerates    Saint Paul, Minnesota  1957-03-04   
1               Building Products     Milwaukee, Wisconsin  2017-07-26   
2           Health Care Equipment  North Chicago, Illinois  1957-03-04   
3                   Biotechnology  North Chicago, Illinois  2012-12-31   
4  IT Consulting & Other Services          Dublin, Ireland  2011-07-06   

       CIK      Founded  
0    66740         1902  
1    91142         1916  
2     1800         1888  
3  1551152  2013 (1888)  
4  1467373         1989  

Total companies: 503
In [7]:
sp500_df.to_csv("data/raw/sp500.csv", index=False)
In [8]:
market_csv_path = "data/raw/market_data.csv"
spy_csv_path = "data/raw/spy_monthly_returns.csv"

if os.path.exists(market_csv_path) and os.path.exists(spy_csv_path):
    monthly_returns = pd.read_csv(market_csv_path, parse_dates=["Date"]).set_index("Date")
    spy_monthly_returns = pd.read_csv(spy_csv_path, parse_dates=["Date"]).set_index("Date")["SPY"]

    monthly_returns.index = monthly_returns.index.to_period("M").to_timestamp()
    spy_monthly_returns.index = spy_monthly_returns.index.to_period("M").to_timestamp()
else:
    if not ALLOW_DOWNLOADS:
        raise RuntimeError(
            "Market/benchmark return files are missing and downloads are disabled. "
            "Either set USE_CACHED_DATA = False or provide data/raw/market_data.csv and data/raw/spy_monthly_returns.csv"
        )

    # Download monthly data with proper parameters
    start_date = "2005-01-01"
    end_date = datetime.now().strftime("%Y-%m-%d")

    print("DOWNLOADING S&P 500 DATA")

    # Download SPY benchmark data - monthly
    print("\n[1/2] Downloading SPY (Benchmark)...")
    spy_data = yf.download(
        tickers="SPY",
        start=start_date,
        end=end_date,
        interval="1mo",
        progress=True,
        auto_adjust=True,
    )

    # Download all S&P 500 stocks - monthly
    print("\n[2/2] Downloading all S&P 500 stocks...")
    price_data = yf.download(
        tickers=sp500_tickers,
        start=start_date,
        end=end_date,
        interval="1mo",
        threads=True,
        progress=True,
        auto_adjust=True,
        group_by="column",
    )

    # Calculate monthly returns (with auto_adjust=True, use 'Close' not 'Adj Close')
    monthly_returns = price_data["Close"].pct_change(fill_method=None)
    spy_monthly_returns = spy_data["Close"].pct_change(fill_method=None)

    # Standardize monthly timestamps to month-start and drop the current (incomplete) month if present
    monthly_returns.index = monthly_returns.index.to_period("M").to_timestamp()
    spy_monthly_returns.index = spy_monthly_returns.index.to_period("M").to_timestamp()

    current_month = pd.Timestamp.today().to_period("M")
    if len(monthly_returns) > 0 and monthly_returns.index.max().to_period("M") == current_month:
        monthly_returns = monthly_returns.iloc[:-1]
    if len(spy_monthly_returns) > 0 and spy_monthly_returns.index.max().to_period("M") == current_month:
        spy_monthly_returns = spy_monthly_returns.iloc[:-1]

    print("DOWNLOAD SUMMARY")
    print(f" Price data shape: {price_data.shape}")
    print(f"Monthly returns shape: {monthly_returns.shape}")
    print(f"Date range: {monthly_returns.index[0].strftime('%Y-%m-%d')} to {monthly_returns.index[-1].strftime('%Y-%m-%d')}")
    print(f"Total months: {len(monthly_returns)}")
    print(f"Successfully downloaded: {monthly_returns.shape[1]} stocks")
In [9]:
monthly_returns
Out[9]:
A AAPL ABBV ABNB ABT ACGL ACN ADBE ADI ADM ... WY WYNN XEL XOM XYL XYZ YUM ZBH ZBRA ZTS
Date
2005-02-01 0.085482 0.166710 NaN NaN 0.027219 0.129322 -0.019194 0.085237 0.023127 -0.004132 ... 0.072596 0.091672 -0.025838 0.226938 NaN NaN 0.054695 0.089410 -0.020813 NaN
2005-03-01 -0.075000 -0.071111 NaN NaN 0.013699 -0.034716 -0.054794 0.087773 -0.014175 0.023453 ... 0.030018 -0.053514 -0.030475 -0.053990 NaN NaN 0.062116 -0.094179 -0.047724 NaN
2005-04-01 -0.065316 -0.134629 NaN NaN 0.054482 -0.001249 -0.101449 -0.114461 -0.056170 -0.268104 ... 0.001606 -0.218482 0.012446 -0.043121 NaN NaN -0.093611 0.046396 0.005685 NaN
2005-05-01 0.157109 0.102606 NaN NaN -0.013048 0.116529 0.072811 0.113839 0.087071 0.103392 ... -0.065005 -0.115036 0.072760 -0.014554 NaN NaN 0.094341 -0.059445 -0.108878 NaN
2005-06-01 -0.041232 -0.074195 NaN NaN 0.015962 0.008959 -0.026203 -0.136171 0.008941 0.081786 ... -0.000419 0.008965 0.059143 0.027806 NaN NaN 0.015402 -0.005354 0.028900 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2025-07-01 -0.027116 0.011698 0.018317 0.000529 -0.072201 -0.054805 -0.106360 -0.075450 -0.051959 0.026525 ... -0.024912 0.163980 0.087388 0.035622 0.117965 0.137347 -0.027197 0.007430 0.099429 -0.065149
2025-08-01 0.096809 0.118370 0.122724 -0.014198 0.055991 0.063560 -0.021826 -0.002768 0.118773 0.156146 ... 0.032735 0.162524 -0.014297 0.023737 -0.021159 0.030805 0.019563 0.157665 -0.064686 0.076363
2025-09-01 0.021407 0.098126 0.100475 -0.069792 0.009649 -0.008740 -0.051429 -0.011074 -0.022323 -0.038115 ... -0.033929 0.014309 0.114104 -0.004294 0.044851 -0.092541 0.034225 -0.071631 -0.062853 -0.064450
2025-10-01 0.142609 0.061815 -0.058305 0.042168 -0.077049 -0.048716 0.014193 -0.035266 -0.043321 0.013224 ... -0.072207 -0.072347 0.014330 0.014279 0.022712 0.050782 -0.086310 0.023413 -0.093922 -0.015241
2025-11-01 0.048784 0.031364 0.051832 -0.075470 0.047348 0.088171 -0.000400 -0.059299 0.133302 0.003469 ... -0.034348 0.081435 0.011581 0.013641 -0.067484 -0.120358 0.108530 -0.030231 -0.061281 -0.107320

250 rows × 503 columns

In [10]:
spy_monthly_returns
Out[10]:
Date
2005-02-01    0.020904
2005-03-01   -0.022134
2005-04-01   -0.014880
2005-05-01    0.032224
2005-06-01   -0.002511
                ...   
2025-07-01    0.026056
2025-08-01    0.020520
2025-09-01    0.032757
2025-10-01    0.026676
2025-11-01    0.001950
Freq: MS, Name: SPY, Length: 250, dtype: float64
In [11]:
market_csv_path = "data/raw/market_data.csv"
spy_csv_path = "data/raw/spy_monthly_returns.csv"

if not (os.path.exists(market_csv_path) and os.path.exists(spy_csv_path)):
    monthly_returns_to_save = monthly_returns.dropna(how="all")
    monthly_returns_to_save.index.name = "Date"
    monthly_returns_to_save.to_csv(market_csv_path, index=True)

    # Ensure the benchmark CSV has a consistent column name: SPY
    if isinstance(spy_monthly_returns, pd.Series):
        spy_to_save = spy_monthly_returns.rename("SPY").to_frame()
    else:
        spy_to_save = spy_monthly_returns.copy()
        if list(spy_to_save.columns) != ["SPY"] and spy_to_save.shape[1] == 1:
            spy_to_save.columns = ["SPY"]

    # Drop the first NaN return from pct_change()
    spy_to_save = spy_to_save.dropna()
    spy_to_save.index.name = "Date"
    spy_to_save.to_csv(spy_csv_path, index=True)

print("Saved/using cached files:")
print(" -", market_csv_path)
print(" -", spy_csv_path)
Saved/using cached files:
 - data/raw/market_data.csv
 - data/raw/spy_monthly_returns.csv
In [12]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
In [13]:
ff5_data = pd.read_csv('data/raw/ff5_data.csv')
ff5_data['Date'] = pd.to_datetime(ff5_data['Date'].astype(str), format='%Y%m%d', errors='coerce')
ff5_data = ff5_data.dropna(subset=['Date']).set_index('Date').sort_index()
ff5_data = ff5_data / 100.0

spy_returns = pd.read_csv('data/raw/spy_monthly_returns.csv', index_col=0)
spy_returns.index = pd.to_datetime(spy_returns.index, errors='coerce')
spy_returns = spy_returns[spy_returns.index.notna()]
spy_returns = spy_returns.rename(columns={'SPY': 'SPY_returns'})

sp500_returns = pd.read_csv('data/raw/market_data.csv', index_col=0)
sp500_returns.index = pd.to_datetime(sp500_returns.index, errors='coerce')
sp500_returns = sp500_returns[sp500_returns.index.notna()]

print("Data loaded successfully!")
print(f"FF5 data: {ff5_data.shape}")
print(f"SPY returns: {spy_returns.shape}")
print(f"SP500 returns: {sp500_returns.shape}")
Data loaded successfully!
FF5 data: (15690, 6)
SPY returns: (250, 1)
SP500 returns: (250, 503)

Data Exploration¶

In [14]:
# Restrict to the analysis period (2005 onward)
ff5_recent = ff5_data.loc['2005-01-01':]
print(f"Data range: {ff5_recent.index[0]} to {ff5_recent.index[-1]}")
print(f"Shape: {ff5_recent.shape}")
print("\nFirst 10 rows:")
print(ff5_recent.head(10))
Data range: 2005-01-03 00:00:00 to 2025-10-31 00:00:00
Shape: (5242, 6)

First 10 rows:
            Mkt-RF     SMB     HML     RMW     CMA      RF
Date                                                      
2005-01-03 -0.0097 -0.0063 -0.0005  0.0035 -0.0001  0.0001
2005-01-04 -0.0130 -0.0051  0.0044  0.0083 -0.0048  0.0001
2005-01-05 -0.0051 -0.0114  0.0002  0.0004 -0.0013  0.0001
2005-01-06  0.0034 -0.0003  0.0013  0.0052 -0.0012  0.0001
2005-01-07 -0.0022 -0.0084 -0.0010 -0.0015 -0.0002  0.0001
2005-01-10  0.0042  0.0029  0.0012  0.0000 -0.0023  0.0001
2005-01-11 -0.0068 -0.0029  0.0037  0.0088 -0.0017  0.0001
2005-01-12  0.0038 -0.0009 -0.0009 -0.0022  0.0012  0.0001
2005-01-13 -0.0077  0.0024  0.0039  0.0005  0.0001  0.0001
2005-01-14  0.0066  0.0050  0.0004 -0.0019  0.0015  0.0001
In [15]:
# 3.2 Main characteristics of data
print("SUMMARY STATISTICS")
print(ff5_recent.describe())

print("\nDATA TYPES")
print(ff5_recent.dtypes)

print("\nMISSING VALUES")
missing = ff5_recent.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "No missing values")

# Drop rows with missing values
ff5_recent = ff5_recent.dropna()

# Convert index to datetime if not already
ff5_recent.index = pd.to_datetime(ff5_recent.index)

# Convert daily factor data to monthly data.
# RF is a daily simple rate; we compound it to get monthly RF.
rf_monthly = (1 + ff5_recent['RF']).resample('M').prod() - 1

# Mkt-RF is an excess return. To aggregate correctly, build the market total return first,
# then subtract the compounded monthly RF.
mkt_total_daily = ff5_recent['Mkt-RF'] + ff5_recent['RF']
mkt_total_monthly = (1 + mkt_total_daily).resample('M').prod() - 1
mkt_rf_monthly = mkt_total_monthly - rf_monthly

# Style factors are long/short portfolio returns; we compound within the month.
other_factors = ['SMB', 'HML', 'RMW', 'CMA']
other_monthly = (1 + ff5_recent[other_factors]).resample('M').prod() - 1

ff5_monthly = pd.concat(
    [mkt_rf_monthly.rename('Mkt-RF'), other_monthly, rf_monthly.rename('RF')],
    axis=1
)

# Align to month-start timestamps (matches SPY monthly return index)
ff5_monthly.index = ff5_monthly.index.to_period('M').to_timestamp()
SUMMARY STATISTICS
            Mkt-RF          SMB          HML          RMW          CMA  \
count  5242.000000  5242.000000  5242.000000  5242.000000  5242.000000   
mean      0.000421    -0.000024    -0.000034     0.000125    -0.000008   
std       0.012247     0.006396     0.007982     0.004586     0.003946   
min      -0.120100    -0.045800    -0.050300    -0.027100    -0.029200   
25%      -0.004300    -0.003700    -0.003500    -0.002400    -0.002000   
50%       0.000700    -0.000100    -0.000200     0.000100    -0.000100   
75%       0.006000     0.003500     0.003200     0.002500     0.001900   
max       0.113600     0.057100     0.067300     0.042500     0.024700   

                RF  
count  5242.000000  
mean      0.000067  
std       0.000083  
min       0.000000  
25%       0.000000  
50%       0.000000  
75%       0.000100  
max       0.000200  

DATA TYPES
Mkt-RF    float64
SMB       float64
HML       float64
RMW       float64
CMA       float64
RF        float64
dtype: object

MISSING VALUES
No missing values
In [16]:
ff5_monthly
Out[16]:
Mkt-RF SMB HML RMW CMA RF
Date
2005-01-01 -0.027735 -0.011014 0.021046 0.028971 -0.014525 0.002002
2005-02-01 0.018765 -0.002827 0.014022 0.012924 -0.002025 0.001902
2005-03-01 -0.019641 -0.014406 0.021232 0.004640 0.013652 0.002202
2005-04-01 -0.026120 -0.040480 0.000432 0.009965 -0.009089 0.002102
2005-05-01 0.036674 0.027503 -0.005475 -0.009930 0.002468 0.002102
... ... ... ... ... ... ...
2025-06-01 0.048675 0.000082 -0.015139 -0.029946 0.013773 0.004008
2025-07-01 0.019862 -0.000945 -0.012762 -0.003399 -0.020352 0.004409
2025-08-01 0.018490 0.048683 0.042643 -0.006815 0.019643 0.004208
2025-09-01 0.034065 -0.020728 -0.010410 -0.020575 -0.021309 0.004208
2025-10-01 0.019272 -0.012656 -0.030175 -0.051894 -0.039484 0.004610

250 rows × 6 columns

Plot FF factors with historical context¶

We now plot the FF5 data from 2005 onward, which reveals more detail than the full-history plot shown earlier. The time-series plots show clear volatility clustering around major economic events, including the 2008 Global Financial Crisis and the 2020 COVID-19 market crash.

In [17]:
fig, axes = plt.subplots(6, 1, figsize=(14, 18))
fig.suptitle('Fama-French 5 Factors: Historical Performance & Key Events', 
             fontsize=16, fontweight='bold')

factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
colors = ['black', 'blue', 'green', 'red', 'purple', 'orange']

# Key historical events to annotate
events = {    
    '2008-09': 'Global Financial Crisis',
    '2020-03': 'COVID-19 Crash',
    '2020-11': 'Vaccine Announcement',
    '2022-01': 'Fed Rate Hikes Begin',    
}

for i, (factor, color) in enumerate(zip(factors, colors)):
    # Plot the time series
    axes[i].plot(ff5_monthly.index, ff5_monthly[factor], color=color, linewidth=1.5, marker='o', markersize=2)
    axes[i].set_title(f'{factor}', fontsize=12, fontweight='bold')
    
    # Better y-axis label based on what FF data actually is
    if factor == 'RF':
        axes[i].set_ylabel('Risk-Free Rate', fontsize=10)
    else:
        axes[i].set_ylabel('Monthly Return', fontsize=10)
    
    axes[i].grid(True, alpha=0.3)
    axes[i].axhline(y=0, color='gray', linestyle='--', linewidth=0.8)
    
    # Add event annotations
    for date, event in events.items():
        event_date = pd.to_datetime(date)
        if event_date >= ff5_monthly.index.min() and event_date <= ff5_monthly.index.max():
            axes[i].axvline(x=event_date, color='red', 
                          linestyle=':', alpha=0.5, linewidth=1)
            # Add text label only on first subplot
            if i == 0:
                axes[i].text(event_date, axes[i].get_ylim()[1]*0.9, event, 
                           rotation=90, verticalalignment='top', fontsize=8)

plt.tight_layout()
plt.show()
[Figure: "Fama-French 5 Factors: Historical Performance & Key Events" — monthly factor series annotated with the events listed above]

Statistical Method #1 – Distribution & Summary Statistics¶

For each FF5 factor, the monthly distribution is plotted below. Mkt-RF, SMB, HML, RMW, and CMA appear approximately normally distributed, while RF is right-skewed.

In [18]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('distribution analysis: understanding factor patterns', fontsize=16)

axes = axes.flatten()
for i, factor in enumerate(factors):
    # Simple histogram
    axes[i].hist(ff5_monthly[factor], bins=20, alpha=0.7, color=colors[i], edgecolor='black')
    
    # Add mean line
    mean_val = ff5_monthly[factor].mean()
    axes[i].axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.4f}')
    
    # Add median line
    median_val = ff5_monthly[factor].median()
    axes[i].axvline(median_val, color='green', linestyle='--', linewidth=2, label=f'Median: {median_val:.4f}')
    
    axes[i].set_title(f'{factor}', fontsize=12, fontweight='bold')
    axes[i].set_xlabel('Return Value', fontsize=10)
    axes[i].set_ylabel('Frequency (Count)', fontsize=10)
    axes[i].legend(fontsize=8)
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
[Figure: "Distribution analysis" — histograms of each factor with mean and median reference lines]
In [19]:
# statistics
print("STATISTICS FOR EACH FACTOR")
for factor in factors:
    print(f"{factor}:")
    print(f"  Mean (Average):     {ff5_monthly[factor].mean():.4f}")
    print(f"  Median (Middle):    {ff5_monthly[factor].median():.4f}")
    print(f"  Std Dev (Spread):   {ff5_monthly[factor].std():.4f}")
    print()

# check for symmetry 
print("IS THE DATA BALANCED?")
for factor in factors:
    mean_val = ff5_monthly[factor].mean()
    median_val = ff5_monthly[factor].median()
    std_dev = ff5_monthly[factor].std()
    
    if abs(mean_val - median_val) < 0.1 * std_dev:
        balance = " Balanced (mean ≈ median)"
    elif mean_val > median_val:
        balance = " Right-skewed (more extreme positive values)"
    else:
        balance = " Left-skewed (more extreme negative values)"
    
    print(f"{factor}: {balance}")
STATISTICS FOR EACH FACTOR
Mkt-RF:
  Mean (Average):     0.0083
  Median (Middle):    0.0135
  Std Dev (Spread):   0.0444

SMB:
  Mean (Average):     -0.0006
  Median (Middle):    -0.0009
  Std Dev (Spread):   0.0266

HML:
  Mean (Average):     -0.0008
  Median (Middle):    -0.0033
  Std Dev (Spread):   0.0323

RMW:
  Mean (Average):     0.0026
  Median (Middle):    0.0026
  Std Dev (Spread):   0.0190

CMA:
  Mean (Average):     -0.0002
  Median (Middle):    -0.0010
  Std Dev (Spread):   0.0192

RF:
  Mean (Average):     0.0014
  Median (Middle):    0.0000
  Std Dev (Spread):   0.0017

IS THE DATA BALANCED?
Mkt-RF:  Left-skewed (more extreme negative values)
SMB:  Balanced (mean ≈ median)
HML:  Balanced (mean ≈ median)
RMW:  Balanced (mean ≈ median)
CMA:  Balanced (mean ≈ median)
RF:  Right-skewed (more extreme positive values)
In [20]:
ff5_monthly[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']].describe()
Out[20]:
            Mkt-RF         SMB         HML         RMW         CMA          RF
count   250.000000  250.000000  250.000000  250.000000  250.000000  250.000000
mean      0.008269   -0.000589   -0.000849    0.002576   -0.000159    0.001418
std       0.044389    0.026621    0.032316    0.019001    0.019188    0.001740
min      -0.172062   -0.093032   -0.146437   -0.051894   -0.069334    0.000000
25%      -0.017638   -0.018432   -0.017448   -0.007863   -0.012382    0.000000
50%       0.013542   -0.000866   -0.003298    0.002644   -0.001038    0.000000
75%       0.033872    0.014471    0.013992    0.012318    0.010395    0.002303
max       0.136169    0.081782    0.137614    0.076685    0.083789    0.004610

Quantitative conclusion (Method #1):

From the summary statistics and distributions:

  • Mkt-RF is the dominant driver of variability among the factors (it has the largest spread/volatility).
  • SMB, HML, RMW, CMA are smaller “tilt” factors that tend to be closer to zero on average and have lower volatility than the market factor.
  • RF is comparatively small and stable.

Overall, this distribution analysis supports the intuition that the market factor explains most broad market movement, while the other FF factors provide secondary style exposures that can matter more in specific regimes.

Statistical Method #2 – Outliers via Boxplots + IQR Rule¶

The boxplots and IQR counts below show that each factor has only a small number of outliers.

In [21]:
# box plots to see spread and outliers
fig, axes = plt.subplots(1, 6, figsize=(18, 5))
fig.suptitle('box plots: spread and outliers', fontsize=16)

for i, factor in enumerate(factors):
    axes[i].boxplot(ff5_monthly[factor], patch_artist=True,
                   boxprops=dict(facecolor=colors[i], alpha=0.7),
                   medianprops=dict(color='red', linewidth=2))
    axes[i].set_title(factor, fontsize=12)
    axes[i].set_ylabel('Return Value', fontsize=10)
    axes[i].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()
[Figure: box plots of each factor, showing spread and outlier points]
In [22]:
# Outlier detection using IQR rule
outlier_counts = {}

print("NUMBER OF OUTLIERS USING IQR RULE")
for factor in factors:
    q1 = ff5_monthly[factor].quantile(0.25)
    q3 = ff5_monthly[factor].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    mask = (ff5_monthly[factor] < lower) | (ff5_monthly[factor] > upper)
    count = mask.sum()
    outlier_counts[factor] = count

    print(f"{factor}: {count} outliers "
          f"(lower={lower:.4f}, upper={upper:.4f})")
NUMBER OF OUTLIERS USING IQR RULE
Mkt-RF: 7 outliers (lower=-0.0949, upper=0.1111)
SMB: 7 outliers (lower=-0.0678, upper=0.0638)
HML: 18 outliers (lower=-0.0646, upper=0.0612)
RMW: 11 outliers (lower=-0.0381, upper=0.0426)
CMA: 8 outliers (lower=-0.0465, upper=0.0446)
RF: 0 outliers (lower=-0.0035, upper=0.0058)

Outlier rule and policy (Method #2):

  • We define outliers using the standard IQR rule: any value below $Q1 - 1.5 \cdot IQR$ or above $Q3 + 1.5 \cdot IQR$.
  • Each factor has only a small number of outliers relative to the total number of months, and these points correspond to real market stress periods (e.g., crises, crashes).
  • Because we want our model to capture real extreme events, we keep these outliers in the dataset instead of removing or winsorizing them. We will, however, check model diagnostics later to ensure they don’t completely dominate the regression.

Statistical Method #3 – Correlation & Hypothesis Test (SPY vs FF5 Factors)¶

We align data to a monthly frequency (by compounding daily data within each month). Because Mkt-RF is defined as an excess return, we compare SPY excess returns (SPY − RF) to the FF5 factors at a monthly frequency.

In [23]:
combined = pd.DataFrame(spy_returns)
combined = combined.join(ff5_monthly[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']], how='inner')

# Use SPY excess returns to match the definition of Mkt-RF (market excess return)
combined['SPY_excess'] = combined['SPY_returns'] - combined['RF']

print("Correlation ranges from -1 to +1:")
print("  +1 = Perfect positive relationship (move together)")
print("   0 = No relationship")
print("  -1 = Perfect negative relationship (move opposite)\n")

correlations = combined.corr()['SPY_excess'].drop(['SPY_returns', 'SPY_excess']).sort_values(ascending=False)
print(correlations)

plt.figure(figsize=(10, 8))
cols = ['SPY_excess', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
sns.heatmap(combined[cols].corr(), annot=True, cmap='coolwarm', center=0,
            fmt='.3f', square=True, linewidths=1, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix: SPY Excess Returns vs FF5 Factors (Monthly)', fontsize=14)
plt.tight_layout()
plt.show()

strongest = correlations.abs().idxmax()
print(f"Strongest relationship: {strongest} (correlation = {correlations[strongest]:.3f})")
print("This factor moves most closely with SPY excess returns")

weakest = correlations.abs().idxmin()
print(f"\nWeakest relationship: {weakest} (correlation = {correlations[weakest]:.3f})")
print("This factor has little connection to SPY excess returns")
Correlation ranges from -1 to +1:
  +1 = Perfect positive relationship (move together)
   0 = No relationship
  -1 = Perfect negative relationship (move opposite)

Mkt-RF    0.990466
SMB       0.304479
HML       0.138649
RF       -0.057112
CMA      -0.113793
RMW      -0.147610
Name: SPY_excess, dtype: float64
[Figure: correlation heatmap of SPY excess returns vs FF5 factors (monthly)]
Strongest relationship: Mkt-RF (correlation = 0.990)
This factor moves most closely with SPY excess returns

Weakest relationship: RF (correlation = -0.057)
This factor has little connection to SPY excess returns
Key Learnings from Correlation Analysis¶
  • The factor with the largest absolute correlation (printed above) is the one most closely related to SPY excess returns in this sample.
  • In practice, Mkt-RF is typically expected to be the dominant driver because SPY closely tracks the broad market.
  • The remaining factors (SMB/HML/RMW/CMA) are style tilts and can be weaker or regime-dependent.
  • RF often shows a weak relationship with equity excess returns (and we already subtract RF when forming SPY excess returns).

Formal hypothesis test on correlation (Method #3)¶

  • H₀ (null): true correlation between SPY excess returns and Mkt-RF is 0.
  • H₁ (alternative): true correlation is not 0.
  • α = 0.05, Pearson correlation test.
In [24]:
valid = combined[['SPY_excess', 'Mkt-RF']].dropna()

r, p_value = pearsonr(valid['SPY_excess'], valid['Mkt-RF'])

print(f"Correlation coefficient (r): {r:.3f}")
print(f"p-value: {p_value:.4g}")
Correlation coefficient (r): 0.990
p-value: 1.172e-214

Hypothesis test result (Method #3):

  • r is the sample correlation between monthly SPY excess returns and monthly Mkt-RF.
  • The p-value is the probability of observing a correlation at least this extreme if the true correlation were zero.

Decision rule at α = 0.05:

  • If p-value < 0.05, reject H₀ (evidence correlation is not zero).
  • If p-value >= 0.05, do not reject H₀.

Since the p-value is far below 0.05, we reject H₀ that the correlation between SPY excess returns and Mkt-RF is zero and accept the alternative hypothesis that the correlation is non-zero. In fact, the correlation is very high, as shown in the heatmap, with a Pearson correlation coefficient of 0.99.
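As a sanity check on `pearsonr`, the p-value can be reproduced from the textbook t-statistic, $t = r\sqrt{n-2}/\sqrt{1-r^2}$, with $n-2$ degrees of freedom (synthetic data below, for illustration only):

```python
import numpy as np
from scipy import stats

# Synthetic correlated data standing in for monthly SPY excess returns vs Mkt-RF
rng = np.random.default_rng(0)
x = rng.normal(size=120)                          # ~10 years of monthly obs
y = 0.9 * x + rng.normal(scale=0.3, size=120)

r, p = stats.pearsonr(x, y)

# Manual p-value via the t-distribution with n-2 degrees of freedom
n = len(x)
t = r * np.sqrt(n - 2) / np.sqrt(1 - r**2)
p_manual = 2 * stats.t.sf(abs(t), df=n - 2)

print(f"r = {r:.3f}, p (pearsonr) = {p:.3g}, p (manual) = {p_manual:.3g}")
```

The two p-values agree, which confirms the test in the cell above is the standard Pearson correlation t-test.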

Factor-Based Stock Selection vs the S&P 500 (FF5 Rolling Strategy)¶

Baseline Strategy/Model: The Fama-French 5-Factor (FF5) rolling regression approach. This uses OLS (Ordinary Least Squares) to estimate rolling 36-month betas for each stock (on excess returns vs. FF5 factors), then predicts monthly returns using those betas and a lagged factor forecast. It forms a long-only equal-weight portfolio of the top 50 predicted stocks.

  • Estimate rolling 36-month FF5 betas per stock (on excess returns)
  • Forecast factor returns using a lagged trailing mean (no look-ahead)
  • Rank stocks each month by predicted return signal
  • Form a long-only equal-weight top-50 portfolio and compare vs SPY
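The signal step in the bullets above reduces to a dot product of each stock's estimated betas with the factor forecast. A toy sketch with made-up numbers (the real strategy does this each month for ~500 stocks and keeps the top 50):

```python
import numpy as np

# Hypothetical FF5 betas for two stocks; columns = [Mkt-RF, SMB, HML, RMW, CMA]
betas = np.array([
    [1.2,  0.3, -0.1, 0.0,  0.1],   # stock A (illustrative)
    [0.8, -0.2,  0.4, 0.2, -0.3],   # stock B (illustrative)
])

# Lagged trailing-mean factor forecast for the coming month (illustrative)
factor_forecast = np.array([0.006, 0.001, 0.002, 0.001, 0.000])

# Predicted monthly excess return per stock: beta · forecast
signal = betas @ factor_forecast
print(signal)  # stock A: 0.0073, stock B: 0.0056

# Rank descending and select the top-K names (K=1 here; K=50 in the strategy)
top = np.argsort(signal)[::-1][:1]
print("Top pick index:", top[0])
```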

Import Libraries¶

In [25]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

plt.style.use("seaborn-v0_8")

# Ensure docs assets folder exists for GitHub Pages figures
os.makedirs("docs/assets", exist_ok=True)

# XGBoost (Model 2 learning-to-rank)
try:
    from xgboost import XGBRanker
    _HAS_XGBOOST = True
except Exception:
    XGBRanker = None
    _HAS_XGBOOST = False

Load and Prepare Fama–French 5-Factor Data¶

The FF5 dataset is provided in daily format, so we first:

  1. Load the raw daily CSV
  2. Remove footer text rows
  3. Convert the date to a proper datetime
  4. Forward-fill missing values
  5. Convert from daily to monthly by compounding within each month
  6. Convert the monthly dates to month-start to align with our market data

This ensures both datasets use the same monthly timestamps and that the monthly factors represent monthly returns (not a single end-of-month observation).

In [26]:
ff = pd.read_csv("data/raw/ff5_data.csv")
ff = ff[ff["Date"].astype(str).str.isdigit()].copy()
ff["date"] = pd.to_datetime(ff["Date"].astype(str), format="%Y%m%d", errors="coerce")
ff = ff.dropna(subset=["date"]).drop(columns=["Date"]).sort_values("date").reset_index(drop=True)

factor_cols = ["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"]
ff[factor_cols] = ff[factor_cols].ffill()
ff[factor_cols] = ff[factor_cols] / 100.0

ff = ff.set_index("date")[factor_cols].sort_index()

# Monthly RF by compounding daily RF
rf_monthly = (1 + ff["RF"]).resample("M").prod() - 1

# Monthly Mkt-RF computed from compounded market total return minus compounded RF
mkt_total_daily = ff["Mkt-RF"] + ff["RF"]
mkt_total_monthly = (1 + mkt_total_daily).resample("M").prod() - 1
mkt_rf_monthly = mkt_total_monthly - rf_monthly

# Style factors compounded within the month
other_factors = ["SMB", "HML", "RMW", "CMA"]
other_monthly = (1 + ff[other_factors]).resample("M").prod() - 1

ff_monthly = pd.concat(
    [mkt_rf_monthly.rename("Mkt-RF"), other_monthly, rf_monthly.rename("RF")],
    axis=1,
)

# Align to month-start timestamps (matches market data index)
ff_monthly.index = ff_monthly.index.to_period("M").to_timestamp()
ff_monthly = ff_monthly.reset_index()  # index is already named "date"

ff_monthly.head()
Out[26]:
date Mkt-RF SMB HML RMW CMA RF
0 1963-07-01 -0.004035 -0.004712 -0.008305 0.006493 -0.011779 0.002202
1 1963-08-01 0.050698 -0.007507 0.016490 0.003688 -0.003525 0.002202
2 1963-09-01 -0.015857 -0.004544 0.000067 -0.007802 0.001667 0.002002
3 1963-10-01 0.025157 -0.013107 -0.000324 0.027383 -0.021471 0.002303
4 1963-11-01 -0.008503 -0.009033 0.018088 -0.004348 0.022234 0.003606

Load Market Return Data (Monthly)¶

We load the pre-cleaned monthly return matrix of S&P 500 constituents. The dataset is already in wide format (ticker columns), and dates are already the first day of each month, so minimal cleaning is needed.

In [27]:
market = pd.read_csv("data/raw/market_data.csv", parse_dates=["Date"])
market = market.rename(columns={"Date": "date"})
market = market.sort_values("date").reset_index(drop=True)

market.head()
Out[27]:
date A AAPL ABBV ABNB ABT ACGL ACN ADBE ADI ... WY WYNN XEL XOM XYL XYZ YUM ZBH ZBRA ZTS
0 2005-02-01 0.085482 0.166710 NaN NaN 0.027219 0.129322 -0.019194 0.085237 0.023127 ... 0.072596 0.091672 -0.025838 0.226938 NaN NaN 0.054695 0.089410 -0.020813 NaN
1 2005-03-01 -0.075000 -0.071111 NaN NaN 0.013699 -0.034716 -0.054794 0.087773 -0.014175 ... 0.030018 -0.053514 -0.030475 -0.053990 NaN NaN 0.062116 -0.094179 -0.047724 NaN
2 2005-04-01 -0.065316 -0.134629 NaN NaN 0.054482 -0.001249 -0.101449 -0.114461 -0.056170 ... 0.001606 -0.218482 0.012446 -0.043121 NaN NaN -0.093611 0.046396 0.005685 NaN
3 2005-05-01 0.157109 0.102606 NaN NaN -0.013048 0.116529 0.072811 0.113839 0.087071 ... -0.065005 -0.115036 0.072760 -0.014554 NaN NaN 0.094341 -0.059445 -0.108878 NaN
4 2005-06-01 -0.041232 -0.074195 NaN NaN 0.015962 0.008959 -0.026203 -0.136171 0.008941 ... -0.000419 0.008965 0.059143 0.027806 NaN NaN 0.015402 -0.005354 0.028900 NaN

5 rows × 504 columns

Merge Market Returns with Factor Data¶

We merge the monthly market returns with the aligned monthly FF5 factors. The merge occurs cleanly after aligning FF5 month-end dates to month-start.

In [28]:
df = market.merge(ff_monthly, on="date", how="inner")

if "RF_y" in df.columns:
    df = df.rename(columns={"RF_y": "RF"})
if "RF_x" in df.columns:
    df = df.drop(columns=["RF_x"])

factor_cols = ["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"]
tickers = [c for c in df.columns if c not in ["date"] + factor_cols]

df.head()
Out[28]:
date A AAPL ABBV ABNB ABT ACGL ACN ADBE ADI ... YUM ZBH ZBRA ZTS Mkt-RF SMB HML RMW CMA RF
0 2005-02-01 0.085482 0.166710 NaN NaN 0.027219 0.129322 -0.019194 0.085237 0.023127 ... 0.054695 0.089410 -0.020813 NaN 0.018765 -0.002827 0.014022 0.012924 -0.002025 0.001902
1 2005-03-01 -0.075000 -0.071111 NaN NaN 0.013699 -0.034716 -0.054794 0.087773 -0.014175 ... 0.062116 -0.094179 -0.047724 NaN -0.019641 -0.014406 0.021232 0.004640 0.013652 0.002202
2 2005-04-01 -0.065316 -0.134629 NaN NaN 0.054482 -0.001249 -0.101449 -0.114461 -0.056170 ... -0.093611 0.046396 0.005685 NaN -0.026120 -0.040480 0.000432 0.009965 -0.009089 0.002102
3 2005-05-01 0.157109 0.102606 NaN NaN -0.013048 0.116529 0.072811 0.113839 0.087071 ... 0.094341 -0.059445 -0.108878 NaN 0.036674 0.027503 -0.005475 -0.009930 0.002468 0.002102
4 2005-06-01 -0.041232 -0.074195 NaN NaN 0.015962 0.008959 -0.026203 -0.136171 0.008941 ... 0.015402 -0.005354 0.028900 NaN 0.005850 0.032672 0.027279 0.008185 -0.005022 0.002202

5 rows × 509 columns

Compute Excess Returns for Each Stock¶

To prepare the regression inputs, we convert raw stock returns into excess returns:

$$ R_{i,t}^{excess} = R_{i,t} - RF_t $$

This is standard in multi-factor regression based on FF5.

In [29]:
df_total = df.copy()

df_excess = df.copy()
df_excess[tickers] = df_excess[tickers].sub(df_excess["RF"], axis=0)

Ordinary Least Squares (OLS) via NumPy¶

The helper function ols_numpy(...) uses np.linalg.lstsq to compute OLS coefficients for the rolling regressions.

In [30]:
def ols_numpy(X, y):
    """Perform closed-form OLS regression."""
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)
    return beta

Rolling-Window Factor Regression¶

The helper function rolling_factor_loadings(...) fits a 36-month rolling regression of excess returns on the FF5 factors and stores the estimated betas over time.

$$ R^{excess}_{i,t} = \alpha_i + \beta_{i}^T F_t + \epsilon_t $$

In [31]:
def rolling_factor_loadings(df, ticker, window=36, min_obs=24):
    betas = []
    dates = []

    y_all = df[ticker].astype(float).values
    X_all = df[["Mkt-RF", "SMB", "HML", "RMW", "CMA"]].astype(float).values
    X_all = np.column_stack([np.ones(len(X_all)), X_all])

    for i in range(window, len(df)):
        y = y_all[i - window:i]
        X = X_all[i - window:i]

        mask = np.isfinite(y) & np.all(np.isfinite(X), axis=1)
        if mask.sum() < min_obs:
            continue

        beta = ols_numpy(X[mask], y[mask])
        betas.append(beta)
        dates.append(df["date"].iloc[i])

    if len(betas) == 0:
        return pd.DataFrame(columns=["alpha", "MKT", "SMB", "HML", "RMW", "CMA"])

    return pd.DataFrame(
        betas,
        index=dates,
        columns=["alpha", "MKT", "SMB", "HML", "RMW", "CMA"],
    )
In [32]:
len(df)
Out[32]:
249

Run Rolling-Window Regressions Across All S&P 500 Tickers¶

We apply the rolling regression to every ticker in the dataset. Some tickers may be skipped due to insufficient return history (e.g., IPOs).

In [33]:
beta_dict = {}

for t in tickers:
    print("Running:", t)
    try:
        beta_dict[t] = rolling_factor_loadings(df_excess, t)
    except Exception as e:
        print(f"Skipping {t}: {e}")
Running: A
Running: AAPL
Running: ABBV
... (output truncated: one "Running: <ticker>" line for each of the ~500 S&P 500 tickers) ...
Running: ZBRA
Running: ZTS

Predict Returns Using Rolling Betas (Signal Generation)¶

We generate a monthly expected return signal by combining each stock’s rolling factor exposures (betas) with a forecast of factor returns. This produces a cross-sectional expected return signal for each stock that we rank each month.

In [34]:
factor_hist = df.set_index("date")[["Mkt-RF", "SMB", "HML", "RMW", "CMA"]]

# Forecast factors using only information available before month t (avoid look-ahead)
factor_forecast = factor_hist.rolling(window=36, min_periods=24).mean().shift(1)

predictions = {}

for t in tickers:
    betas = beta_dict.get(t)
    if betas is None or betas.empty:
        continue

    idx = betas.index.intersection(factor_forecast.index)
    if len(idx) == 0:
        continue

    # Use factor exposures only (exclude intercept/alpha to reduce overfitting risk)
    beta_factors = betas.loc[idx, ["MKT", "SMB", "HML", "RMW", "CMA"]].values
    f_mat = factor_forecast.loc[idx, ["Mkt-RF", "SMB", "HML", "RMW", "CMA"]].values

    pred = (beta_factors * f_mat).sum(axis=1)
    predictions[t] = pd.Series(pred, index=idx)

pred_df = pd.DataFrame(predictions)
pred_df.head()
Out[34]:
A AAPL ABBV ABNB ABT ACGL ACN ADBE ADI ADM ... WY WYNN XEL XOM XYL XYZ YUM ZBH ZBRA ZTS
2008-02-01 -0.004097 0.012060 NaN NaN 0.002053 0.006814 0.001764 0.004358 0.001814 -0.003026 ... 0.002472 -0.001584 0.002107 0.009974 NaN NaN 0.004821 0.003838 -0.003822 NaN
2008-03-01 -0.006852 0.008220 NaN NaN 0.001438 0.003900 0.000507 0.001335 0.000314 -0.002101 ... 0.000189 -0.004799 0.001783 0.005570 NaN NaN 0.002926 0.001098 -0.003738 NaN
2008-04-01 -0.007696 0.009685 NaN NaN 0.000855 0.003922 0.000536 0.002880 0.001590 -0.000558 ... 0.000825 -0.004194 0.001776 0.006196 NaN NaN 0.004970 -0.000228 -0.003104 NaN
2008-05-01 -0.006045 0.014419 NaN NaN -0.000565 0.004876 0.003248 0.005533 0.006201 0.002404 ... 0.001862 -0.000875 0.003500 0.009065 NaN NaN 0.009009 -0.000877 0.000259 NaN
2008-06-01 -0.005518 0.013753 NaN NaN 0.000181 0.004504 0.003254 0.006447 0.006669 0.001348 ... 0.000065 -0.005503 0.003283 0.007945 NaN NaN 0.008464 -0.001886 -0.000301 NaN

5 rows × 496 columns

Long-Only Portfolio Construction¶

Each month:

  • Sort predicted returns
  • Select the top 50 stocks
  • Assign equal weights (1/50 each)

This produces a simple, transparent long-only factor strategy that we compare against the SPY benchmark.

In [35]:
def build_long_only_portfolio(pred_df, k=50):
    weights = pd.DataFrame(0.0, index=pred_df.index, columns=pred_df.columns)

    for dt, row in pred_df.iterrows():
        row = row.dropna()
        if row.empty:
            continue

        top_k = row.nlargest(min(k, len(row))).index
        w = 1.0 / len(top_k)
        weights.loc[dt, top_k] = w

    return weights

weights = build_long_only_portfolio(pred_df)
weights.head()
Out[35]:
A AAPL ABBV ABNB ABT ACGL ACN ADBE ADI ADM ... WY WYNN XEL XOM XYL XYZ YUM ZBH ZBRA ZTS
2008-02-01 0.0 0.02 0.0 0.0 0.0 0.02 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.02 0.0 0.0 0.00 0.0 0.0 0.0
2008-03-01 0.0 0.02 0.0 0.0 0.0 0.00 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.02 0.0 0.0 0.00 0.0 0.0 0.0
2008-04-01 0.0 0.02 0.0 0.0 0.0 0.00 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.02 0.0 0.0 0.02 0.0 0.0 0.0
2008-05-01 0.0 0.02 0.0 0.0 0.0 0.00 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.02 0.0 0.0 0.02 0.0 0.0 0.0
2008-06-01 0.0 0.02 0.0 0.0 0.0 0.00 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.00 0.0 0.0 0.00 0.0 0.0 0.0

5 rows × 496 columns

Backtest & Benchmarking (Strategy vs SPY)¶

Realized Monthly Portfolio Returns¶

We evaluate performance by applying the portfolio weights formed at month (t) to the realized stock returns for month (t).

$R^{\text{portfolio}}_t = \sum_i w_{i,t} R_{i,t}$

In [36]:
# Realized (same-month) total returns for each stock
realized_returns = df_total.set_index("date")[tickers]

common_index = weights.index.intersection(realized_returns.index)
weights_aligned = weights.loc[common_index]
realized_aligned = realized_returns.loc[common_index]

available = realized_aligned.notna()
w_eff = weights_aligned.where(available, 0.0)
denom = w_eff.sum(axis=1).replace(0.0, np.nan)

# Effective portfolio weights actually used after dropping missing returns
weights_used = w_eff.div(denom, axis=0)

portfolio_returns = (w_eff * realized_aligned.fillna(0.0)).sum(axis=1) / denom
active = denom.notna()

portfolio_returns = portfolio_returns.loc[active]
weights_aligned = weights_aligned.loc[active]
weights_used = weights_used.loc[active]

portfolio_returns.head()
Out[36]:
2008-02-01   -0.011657
2008-03-01    0.007785
2008-04-01    0.095576
2008-05-01    0.036239
2008-06-01   -0.065236
Freq: MS, dtype: float64

Load SPY Benchmark Returns¶

We load monthly SPY returns to benchmark our strategy. The benchmark is aligned to the same monthly timestamps as the portfolio.

In [37]:
benchmark = pd.read_csv("data/raw/spy_monthly_returns.csv", parse_dates=["Date"])
benchmark = benchmark.rename(columns={"Date": "date", "SPY": "return"})
benchmark = benchmark.set_index("date")["return"]

benchmark_ret = benchmark.reindex(portfolio_returns.index)
benchmark_ret.head()
Out[37]:
2008-02-01   -0.025842
2008-03-01   -0.013825
2008-04-01    0.052849
2008-05-01    0.015117
2008-06-01   -0.088137
Freq: MS, Name: return, dtype: float64

Performance Evaluation: Sharpe Ratio & Cumulative Returns¶

We compute:

  • Annualized Sharpe Ratio (computed on excess returns)
  • Cumulative returns of both the strategy and the SPY benchmark
  • A comparison plot to see the returns of both strategies

The Sharpe ratio measures risk-adjusted return:

$\text{Sharpe} = \frac{E[R^{\text{excess}}]}{\sigma(R^{\text{excess}})} \times \sqrt{12}$

A higher Sharpe indicates better risk-adjusted performance.

In [38]:
def sharpe_ratio(r):
    r = pd.Series(r).dropna()
    if r.std() == 0 or len(r) == 0:
        return np.nan
    return (r.mean() / r.std()) * np.sqrt(12)

def annualized_vol(r):
    r = pd.Series(r).dropna()
    if len(r) == 0:
        return np.nan
    return r.std() * np.sqrt(12)

def cagr(r):
    r = pd.Series(r).dropna()
    if len(r) == 0:
        return np.nan
    growth = (1 + r).prod()
    years = len(r) / 12
    if years <= 0:
        return np.nan
    return growth ** (1 / years) - 1

def max_drawdown(cum_curve):
    cum_curve = pd.Series(cum_curve).dropna()
    if len(cum_curve) == 0:
        return np.nan
    running_max = cum_curve.cummax()
    dd = cum_curve / running_max - 1
    return dd.min()

# Portfolio diagnostics
holdings_count = (weights_used > 0).sum(axis=1)
turnover = 0.5 * weights_used.diff().abs().sum(axis=1).fillna(0.0)

# Transaction costs are DISABLED by default for this project.
# If you want to enable them later, set cost_bps to a positive value (e.g., 10 or 25).
cost_bps = 0
cost_rate = cost_bps / 10000.0
portfolio_returns_net = portfolio_returns - cost_rate * turnover

# Align RF to portfolio dates for Sharpe and excess-return metrics
rf_series = df_total.set_index("date")["RF"].reindex(portfolio_returns.index)

portfolio_excess = portfolio_returns - rf_series
portfolio_excess_net = portfolio_returns_net - rf_series
benchmark_excess = benchmark_ret - rf_series

# Active return (strategy vs SPY)
active_gross = portfolio_returns - benchmark_ret
active_net = portfolio_returns_net - benchmark_ret

# Equity curves
cum_port_gross = (1 + portfolio_returns).cumprod()
cum_port_net = (1 + portfolio_returns_net).cumprod()
cum_bench = (1 + benchmark_ret.fillna(0.0)).cumprod()

# Drawdowns
dd_port_gross = cum_port_gross / cum_port_gross.cummax() - 1
dd_port_net = cum_port_net / cum_port_net.cummax() - 1
dd_bench = cum_bench / cum_bench.cummax() - 1

# Summary metrics
print("Backtest window:")
print(f"  Start: {portfolio_returns.index.min().date()}")
print(f"  End:   {portfolio_returns.index.max().date()}")
print(f"  Months: {len(portfolio_returns)}")

print("\nHoldings diagnostics:")
print(f"  Avg holdings/month: {holdings_count.mean():.1f}")
print(f"  Min holdings/month: {holdings_count.min()}")
print(f"  Max holdings/month: {holdings_count.max()}")

print("\nTurnover diagnostics:")
print(f"  Avg monthly turnover: {turnover.mean():.3f}")
print(f"  Median monthly turnover: {turnover.median():.3f}")
print(f"  90th pct turnover: {turnover.quantile(0.90):.3f}")

print("\nPerformance (Excess-return metrics for Sharpe):")
print("  Strategy Sharpe (Excess):", sharpe_ratio(portfolio_excess))
print("  Benchmark (SPY) Sharpe (Excess):", sharpe_ratio(benchmark_excess))

print("\nPerformance (Total-return metrics):")
print("  Strategy CAGR:", cagr(portfolio_returns))
print("  SPY CAGR:", cagr(benchmark_ret))

print("  Strategy Vol (ann.):", annualized_vol(portfolio_returns))
print("  SPY Vol (ann.):", annualized_vol(benchmark_ret))

mdd_gross = max_drawdown(cum_port_gross)
mdd_bench = max_drawdown(cum_bench)

print("  Strategy Max Drawdown:", mdd_gross)
print("  SPY Max Drawdown:", mdd_bench)

calmar_gross = cagr(portfolio_returns) / abs(mdd_gross) if pd.notna(mdd_gross) and mdd_gross != 0 else np.nan
calmar_bench = cagr(benchmark_ret) / abs(mdd_bench) if pd.notna(mdd_bench) and mdd_bench != 0 else np.nan

print("  Strategy Calmar:", calmar_gross)
print("  SPY Calmar:", calmar_bench)

print("\nRelative to SPY (Information Ratio on active returns):")
print("  Gross IR:", sharpe_ratio(active_gross))

# Plots
fig = plt.figure(figsize=(12, 5))
plt.plot(cum_port_gross, label="Strategy")
plt.plot(cum_bench, label="SPY")
plt.legend()
plt.title("Cumulative Returns: Strategy vs SPY")
fig.savefig("docs/assets/strategy_cumulative_returns.png", dpi=200, bbox_inches="tight")
plt.show()

fig = plt.figure(figsize=(12, 4))
plt.plot(dd_port_gross, label="Strategy DD")
plt.plot(dd_bench, label="SPY DD")
plt.axhline(0, color="black", linewidth=0.8)
plt.legend()
plt.title("Drawdowns: Strategy vs SPY")
fig.savefig("docs/assets/strategy_drawdowns.png", dpi=200, bbox_inches="tight")
plt.show()

# Rolling 12-month total returns
roll_12m_strategy = (1 + portfolio_returns).rolling(12).apply(np.prod, raw=True) - 1
roll_12m_spy = (1 + benchmark_ret).rolling(12).apply(np.prod, raw=True) - 1

fig = plt.figure(figsize=(12, 4))
plt.plot(roll_12m_strategy, label="Strategy 12M")
plt.plot(roll_12m_spy, label="SPY 12M")
plt.axhline(0, color="black", linewidth=0.8)
plt.legend()
plt.title("Rolling 12-Month Returns")
fig.savefig("docs/assets/strategy_rolling_12m_returns.png", dpi=200, bbox_inches="tight")
plt.show()
Backtest window:
  Start: 2008-02-01
  End:   2025-10-01
  Months: 213

Holdings diagnostics:
  Avg holdings/month: 50.0
  Min holdings/month: 50
  Max holdings/month: 50

Turnover diagnostics:
  Avg monthly turnover: 0.194
  Median monthly turnover: 0.180
  90th pct turnover: 0.320

Performance (Excess-return metrics for Sharpe):
  Strategy Sharpe (Excess): 0.7723862656307524
  Benchmark (SPY) Sharpe (Excess): 0.6859562149485776

Performance (Total-return metrics):
  Strategy CAGR: 0.16337270876902377
  SPY CAGR: 0.11508279637774854
  Strategy Vol (ann.): 0.20845302629748985
  SPY Vol (ann.): 0.1591727803696657
  Strategy Max Drawdown: -0.5204733094313239
  SPY Max Drawdown: -0.46322417130504834
  Strategy Calmar: 0.3138925777914087
  SPY Calmar: 0.24843866859003508

Relative to SPY (Information Ratio on active returns):
  Gross IR: 0.5982336277620093
[Figure: Cumulative Returns — Strategy vs SPY]
[Figure: Drawdowns — Strategy vs SPY]
[Figure: Rolling 12-Month Returns]

The OLS-based Fama–French five-factor rolling strategy delivers higher cumulative returns than the SPY benchmark over the backtest period: its equity curve diverges above SPY's and stays there, suggesting the strategy captures factor premiums through systematic stock selection.

Two-Model ML Comparison (Linear vs XGBoost)¶

After establishing the baseline factor strategy, we test two machine learning models for cross-sectional top‑50 selection using the same feature set (rolling FF5 betas) and the same walk-forward rules.

  • Model 1 (Linear): Logistic Regression classifier (linear decision boundary)
  • Model 2 (XGBoost): Learning-to-rank (non-linear interactions)
In [ ]:
try:
    import sklearn
    _HAS_SKLEARN = True
except Exception:
    _HAS_SKLEARN = False

if not _HAS_XGBOOST:
    _RUN_XGB = False
    print("XGBoost is not installed. To run Model 2, install it with: pip install xgboost==2.0.3")
elif not _HAS_SKLEARN:
    _RUN_XGB = False
    print("scikit-learn is required for Model 2 (XGBRanker). Install it with: pip install scikit-learn==1.5.2")
else:
    _RUN_XGB = True

xgb_k = 50
xgb_n_bins = 5
xgb_min_train_months = 36
xgb_train_window = 60
xgb_retrain_every = 3 # train quarterly to reduce runtime, set to 1 to retrain monthly

xgb_model_params = dict(
    objective="rank:pairwise",
    n_estimators=120,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    verbosity=0,
)

panel_xgb = None
xgb_feature_cols = None
weights_xgb = None
portfolio_returns_xgb = None
portfolio_returns_xgb_net = None
turnover_xgb = None
In [ ]:
# Build the feature/label panel used by the ML models

if _RUN_XGB:
    realized = df_total.set_index("date")[tickers]

    def _stack_to_long(df_wide, value_name):
        s = df_wide.stack(dropna=False).rename(value_name)
        s.index = s.index.set_names(["date", "ticker"])
        return s.reset_index()

    # Betas in long form (betas at date t are estimated using months < t)
    beta_long_parts = []
    for tkr, bdf in beta_dict.items():
        if bdf is None or bdf.empty:
            continue
        tmp = bdf[["MKT", "SMB", "HML", "RMW", "CMA"]].copy()
        tmp["ticker"] = tkr
        tmp["date"] = pd.to_datetime(tmp.index)
        beta_long_parts.append(tmp.reset_index(drop=True))

    beta_long = pd.concat(beta_long_parts, ignore_index=True)

    # Label: excess return in month t (y - RF)
    rf = df_total.set_index("date")["RF"]
    y_excess_wide = realized.sub(rf, axis=0)
    y_excess_long = _stack_to_long(y_excess_wide, "y_excess")

    # Shared panel (one row per date, ticker)
    panel_ml = beta_long.merge(y_excess_long, on=["date", "ticker"], how="left")

    xgb_feature_cols = ["MKT", "SMB", "HML", "RMW", "CMA"]

    panel_ml = panel_ml.replace([np.inf, -np.inf], np.nan)
    panel_ml = panel_ml.dropna(subset=xgb_feature_cols + ["y_excess"]).sort_values(["date", "ticker"]).reset_index(drop=True)

    # Classification label (Linear model): 1 if excess return > 0 else 0
    panel_ml["y_cls"] = (panel_ml["y_excess"] > 0).astype(int)

    # Ranking label (XGBRanker): reserve rel=0 for non-positive excess, bin positives into 1 to (B-1)
    panel_xgb = panel_ml.copy()
    B = xgb_n_bins
    is_pos = panel_xgb["y_excess"] > 0
    panel_xgb["rel"] = 0

    pct = panel_xgb.loc[is_pos].groupby("date")["y_excess"].rank(pct=True, method="first")
    panel_xgb.loc[is_pos, "rel"] = (1 + np.floor(pct * (B - 1)).clip(0, B - 2)).astype(int)

    print("panel_ml shape:", panel_ml.shape)
    print("panel_ml months:", panel_ml["date"].nunique())
    print("panel_ml tickers (avg per month):", panel_ml.groupby("date").size().mean())
    panel_ml.head()
else:
    print("Skipping panel build (XGBoost / scikit-learn not available).")
panel_ml shape: (96718, 9)
panel_ml months: 213
panel_ml tickers (avg per month): 454.07511737089203

Walk-forward training + monthly top-K selection¶

We train the ranker on a rolling window of past months and then score/rank stocks for the next month.

  • Training uses only months strictly before the test month.
  • Each month is treated as one "query group" for learning-to-rank.
  • The output is a dictionary of monthly top-K selections, which we convert into equal weights.
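The split logic described above can be sketched as a small standalone helper (names and parameter values here are illustrative; they mirror the notebook's `lin_train_window` / `lin_min_train_months` style settings but are not the notebook's own function):

```python
import numpy as np
import pandas as pd

def walk_forward_splits(dates, window=60, min_train=36):
    """Yield (train_dates, test_date) pairs where training months
    come strictly before the test month (no look-ahead)."""
    dates = np.array(sorted(pd.unique(dates)))
    for i, test_date in enumerate(dates):
        train = dates[max(0, i - window):i]  # strictly before dates[i]
        if len(train) >= min_train:
            yield train, test_date

# 48 month-start dates; first usable test month arrives once
# min_train months of history exist.
months = pd.date_range("2008-01-01", periods=48, freq="MS")
splits = list(walk_forward_splits(months, window=12, min_train=6))
first_train, first_test = splits[0]
print(len(first_train), pd.Timestamp(first_test).date())  # 6 2008-07-01
```

Because the training slice ends at index `i` (exclusive), the test month's own returns never leak into training, which is the property the backtest relies on.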
In [41]:
# Walk-forward Linear classifier (Logistic Regression) using ONLY the same 5 beta features

try:
    from sklearn.linear_model import LogisticRegression
    _HAS_LOGREG = True
except Exception:
    LogisticRegression = None
    _HAS_LOGREG = False

lin_k = 50
lin_min_train_months = 36
lin_train_window = 60
lin_retrain_every = 3

weights_lin = None
portfolio_returns_lin = None

if _RUN_XGB and _HAS_LOGREG and (panel_ml is not None):
    unique_dates = np.array(sorted(panel_ml["date"].unique()))

    lin_selected = {}
    last_model = None
    last_train_end = None

    for i in range(len(unique_dates)):
        dt = unique_dates[i]

        train_end = i  # strictly before dt
        train_start = max(0, train_end - lin_train_window)
        train_dates = unique_dates[train_start:train_end]

        if len(train_dates) < lin_min_train_months:
            continue

        train_df = panel_ml[panel_ml["date"].isin(train_dates)].copy().sort_values(["date", "ticker"])
        test_df = panel_ml[panel_ml["date"] == dt].copy().sort_values(["date", "ticker"])

        if len(test_df) == 0:
            continue

        need_retrain = (
            last_model is None
            or last_train_end is None
            or lin_retrain_every == 1
            or ((train_end - last_train_end) >= lin_retrain_every)
        )

        if need_retrain:
            X_train = train_df[xgb_feature_cols].values
            y_train = train_df["y_cls"].values

            clf = LogisticRegression(
                penalty="l2",
                C=1.0,
                solver="lbfgs",
                max_iter=1000,
            )
            clf.fit(X_train, y_train)
            last_model = clf
            last_train_end = train_end

        X_test = test_df[xgb_feature_cols].values
        p_good = last_model.predict_proba(X_test)[:, 1]  # P(y_excess > 0)

        test_df = test_df.assign(p_good=p_good)
        top = test_df.nlargest(min(lin_k, len(test_df)), "p_good")["ticker"].tolist()
        lin_selected[pd.to_datetime(dt)] = top

    # Convert selections to equal weights
    lin_dates = sorted(lin_selected.keys())
    weights_lin = pd.DataFrame(0.0, index=pd.to_datetime(lin_dates), columns=tickers)

    for dt, names in lin_selected.items():
        if len(names) == 0:
            continue
        weights_lin.loc[dt, names] = 1.0 / len(names)

    # Compute realized portfolio returns (same method as baseline)
    realized_returns_lin = df_total.set_index("date")[tickers]
    common_index = weights_lin.index.intersection(realized_returns_lin.index)

    weights_lin = weights_lin.loc[common_index]
    realized_lin = realized_returns_lin.loc[common_index]

    available = realized_lin.notna()
    w_eff = weights_lin.where(available, 0.0)
    denom = w_eff.sum(axis=1).replace(0.0, np.nan)

    portfolio_returns_lin = (w_eff * realized_lin.fillna(0.0)).sum(axis=1) / denom
    portfolio_returns_lin = portfolio_returns_lin.loc[denom.notna()]

    print("Linear months:", len(portfolio_returns_lin))
    print("Linear start:", portfolio_returns_lin.index.min().date())
    print("Linear end:  ", portfolio_returns_lin.index.max().date())
else:
    print("Skipping linear classifier (requires scikit-learn + panel_ml).")
Linear months: 177
Linear start: 2011-02-01
Linear end:   2025-10-01
In [42]:
# Walk-forward training and portfolio construction

if _RUN_XGB:
    unique_dates = np.array(sorted(panel_xgb["date"].unique()))

    xgb_selected = {}
    last_model = None
    last_train_end = None

    for i in range(len(unique_dates)):
        dt = unique_dates[i]

        train_end = i  # strictly before dt
        train_start = max(0, train_end - xgb_train_window)
        train_dates = unique_dates[train_start:train_end]

        if len(train_dates) < xgb_min_train_months:
            continue

        train_df = panel_xgb[panel_xgb["date"].isin(train_dates)].copy().sort_values(["date", "ticker"])
        test_df = panel_xgb[panel_xgb["date"] == dt].copy().sort_values(["date", "ticker"])

        if len(test_df) == 0:
            continue

        need_retrain = (
            last_model is None
            or last_train_end is None
            or xgb_retrain_every == 1
            or ((train_end - last_train_end) >= xgb_retrain_every)
        )

        if need_retrain:
            grp = train_df.groupby("date").size().values
            X_train = train_df[xgb_feature_cols].values
            y_train = train_df["rel"].values

            ranker = XGBRanker(
                **xgb_model_params,
                eval_metric=f"ndcg@{xgb_k}",
            )

            ranker.fit(X_train, y_train, group=grp)
            last_model = ranker
            last_train_end = train_end

        X_test = test_df[xgb_feature_cols].values
        scores = last_model.predict(X_test)

        test_df = test_df.assign(score=scores)
        top = test_df.nlargest(min(xgb_k, len(test_df)), "score")["ticker"].tolist()
        xgb_selected[pd.to_datetime(dt)] = top

    # Convert selections to equal weights
    xgb_dates = sorted(xgb_selected.keys())
    weights_xgb = pd.DataFrame(0.0, index=pd.to_datetime(xgb_dates), columns=tickers)

    for dt, names in xgb_selected.items():
        if len(names) == 0:
            continue
        weights_xgb.loc[dt, names] = 1.0 / len(names)

    # Compute realized portfolio returns (same method as baseline)
    realized_returns_xgb = df_total.set_index("date")[tickers]
    common_index = weights_xgb.index.intersection(realized_returns_xgb.index)

    weights_xgb = weights_xgb.loc[common_index]
    realized_xgb = realized_returns_xgb.loc[common_index]

    available = realized_xgb.notna()
    w_eff = weights_xgb.where(available, 0.0)
    denom = w_eff.sum(axis=1).replace(0.0, np.nan)

    weights_used_xgb = w_eff.div(denom, axis=0)
    portfolio_returns_xgb = (w_eff * realized_xgb.fillna(0.0)).sum(axis=1) / denom

    active = denom.notna()
    portfolio_returns_xgb = portfolio_returns_xgb.loc[active]
    weights_used_xgb = weights_used_xgb.loc[active]

    turnover_xgb = 0.5 * weights_used_xgb.diff().abs().sum(axis=1).fillna(0.0)
    portfolio_returns_xgb_net = portfolio_returns_xgb - cost_rate * turnover_xgb

    print("XGB months:", len(portfolio_returns_xgb))
    print("XGB start:", portfolio_returns_xgb.index.min().date())
    print("XGB end:  ", portfolio_returns_xgb.index.max().date())
else:
    print("Skipping walk-forward training/backtest (XGBoost / scikit-learn not available).")
XGB months: 177
XGB start: 2011-02-01
XGB end:   2025-10-01

Model Comparison (Model 1: Linear vs Model 2: XGBoost vs SPY)¶

For a fair comparison we align all series to the same XGB backtest window and compute:

  • Sharpe ratio on excess returns
  • CAGR on total returns
  • A cumulative return plot
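The `sharpe_ratio` and `cagr` helpers are defined earlier in the notebook; minimal versions consistent with the monthly data might look like the following (the factor of 12 is an assumption based on the monthly frequency used throughout):

```python
import numpy as np
import pandas as pd

def sharpe_ratio(excess_monthly, periods_per_year=12):
    """Annualized Sharpe ratio from monthly excess returns."""
    r = pd.Series(excess_monthly).dropna()
    return (r.mean() / r.std()) * np.sqrt(periods_per_year)

def cagr(total_monthly, periods_per_year=12):
    """Compound annual growth rate from monthly total returns."""
    r = pd.Series(total_monthly).dropna()
    growth = (1 + r).prod()
    years = len(r) / periods_per_year
    return growth ** (1 / years) - 1

# A constant 1% monthly return compounds to about 12.68% per year.
flat = pd.Series([0.01] * 24)
print(round(cagr(flat), 4))  # 0.1268
```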
In [43]:
# Comparison report (aligned to XGB window)

if _RUN_XGB and portfolio_returns_xgb is not None and len(portfolio_returns_xgb) > 0:
    aligned_idx = portfolio_returns_xgb.index

    bench_xgb = benchmark.reindex(aligned_idx)
    rf_xgb = df_total.set_index("date")["RF"].reindex(aligned_idx)

    # XGB excess
    xgb_excess = portfolio_returns_xgb - rf_xgb

    # SPY excess
    bench_excess_xgb = bench_xgb - rf_xgb

    print("MODEL COMPARISON (ALIGNED TO XGB WINDOW)")

    if portfolio_returns_lin is not None and len(portfolio_returns_lin.dropna()) > 0:
        lin_al = portfolio_returns_lin.reindex(aligned_idx)
        lin_excess = lin_al - rf_xgb

        print("\nSharpe (Excess Returns):")
        print("  Linear: ", sharpe_ratio(lin_excess))
        print("  XGB:    ", sharpe_ratio(xgb_excess))
        print("  SPY:    ", sharpe_ratio(bench_excess_xgb))

        print("\nCAGR (Total Returns):")
        print("  Linear: ", cagr(lin_al))
        print("  XGB:    ", cagr(portfolio_returns_xgb))
        print("  SPY:    ", cagr(bench_xgb))

        cum_lin = (1 + lin_al.fillna(0.0)).cumprod()
        cum_xgb = (1 + portfolio_returns_xgb.fillna(0.0)).cumprod()
        cum_spy = (1 + bench_xgb.fillna(0.0)).cumprod()

        fig = plt.figure(figsize=(12, 5))
        plt.plot(cum_lin, label="Linear")
        plt.plot(cum_xgb, label="XGB Ranker")
        plt.plot(cum_spy, label="SPY")
        plt.title("Cumulative Returns: Linear vs XGBoost Ranker vs SPY")
        plt.legend()
        fig.savefig("docs/assets/strategy_cumulative_returns_xgb_compare.png", dpi=200, bbox_inches="tight")
        plt.show()
    else:
        print("\nLinear backtest not available; showing XGB vs SPY only.")

        print("\nSharpe (Excess Returns):")
        print("  XGB: ", sharpe_ratio(xgb_excess))
        print("  SPY: ", sharpe_ratio(bench_excess_xgb))

        print("\nCAGR (Total Returns):")
        print("  XGB: ", cagr(portfolio_returns_xgb))
        print("  SPY: ", cagr(bench_xgb))

        cum_xgb = (1 + portfolio_returns_xgb.fillna(0.0)).cumprod()
        cum_spy = (1 + bench_xgb.fillna(0.0)).cumprod()

        fig = plt.figure(figsize=(12, 5))
        plt.plot(cum_xgb, label="XGB Ranker")
        plt.plot(cum_spy, label="SPY")
        plt.title("Cumulative Returns: XGBoost Ranker vs SPY")
        plt.legend()
        fig.savefig("docs/assets/strategy_cumulative_returns_xgb_compare.png", dpi=200, bbox_inches="tight")
        plt.show()
else:
    print("Skipping comparison (XGBoost backtest not available).")
MODEL COMPARISON (ALIGNED TO XGB WINDOW)

Sharpe (Excess Returns):
  Linear:  0.9688939966316122
  XGB:     1.0846525889251435
  SPY:     0.884080424706205

CAGR (Total Returns):
  Linear:  0.18710281373240267
  XGB:     0.25056139695864954
  SPY:     0.1400042145054483
[Figure: Cumulative Returns: Linear vs XGBoost Ranker vs SPY]

Risk-Adjusted Performance¶

Sharpe ratio analysis over the aligned backtest window shows both factor-based models delivering higher risk-adjusted returns than SPY (SPY 0.884, Linear 0.969, XGB 1.085). The XGBoost ranker's edge comes with higher annualized volatility, so drawdowns and calendar-year returns are also compared below.
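Concretely, the Sharpe ratio is computed on monthly excess returns and annualized (assuming the conventional scaling for monthly data):

$$ \text{Sharpe} = \sqrt{12}\,\frac{\operatorname{mean}(R_t - RF_t)}{\operatorname{std}(R_t - RF_t)} $$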

Results¶

Cumulative Returns¶

Cumulative return analysis shows the XGBoost ranker achieving the highest overall growth over the aligned window, with the Linear model in between; SPY has the lowest volatility and the shallowest maximum drawdown. Calendar-year results are mixed: the factor models led strongly in years such as 2019-2020, while 2011 (both models) and 2022 (Linear) lagged SPY.

Model Comparison¶

Transaction costs (how they are modeled, and why the comparison uses gross returns)¶

Net returns were computed by subtracting a simple turnover-based transaction cost:

$$ R^{net}_t = R^{gross}_t - (cost_{\text{rate}} \times turnover_{t}) $$

where turnover is computed from changes in portfolio weights. The comparison tables below are reported on gross returns; the net series (portfolio_returns_xgb_net) is computed but not shown here.
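As a small numeric sketch of the formula above (the 10 bps cost rate is illustrative, not the notebook's configured value):

```python
import pandas as pd

# Two consecutive months of target weights over three tickers.
w_prev = pd.Series({"AAA": 0.5, "BBB": 0.5, "CCC": 0.0})
w_curr = pd.Series({"AAA": 0.25, "BBB": 0.25, "CCC": 0.5})

# One-way turnover: half the sum of absolute weight changes.
turnover = 0.5 * (w_curr - w_prev).abs().sum()

cost_rate = 0.001  # 10 bps per unit of turnover (illustrative)
r_gross = 0.02     # gross monthly return
r_net = round(r_gross - cost_rate * turnover, 6)

print(turnover, r_net)  # 0.5 0.0195
```

Rebalancing half the portfolio in a month therefore costs half the per-unit rate, which is why high-turnover strategies are penalized most.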

We will now compare:

  • Summary metrics table
  • Calendar-year return table
  • Simple comparison plots for:
    • Cumulative returns
    • Drawdowns
    • Rolling 12-month returns
In [44]:
# All-model comparison tables + simple plots (aligned window)

def _perf_summary_table(model_returns, rf):
    rows = []
    for name, r in model_returns.items():
        r = pd.Series(r).dropna()
        if len(r) == 0:
            continue

        rf_al = rf.reindex(r.index)
        ex = r - rf_al

        cum = (1 + r).cumprod()
        mdd = max_drawdown(cum)
        c = cagr(r)
        vol = annualized_vol(r)
        shrp = sharpe_ratio(ex)
        calmar = c / abs(mdd) if pd.notna(mdd) and mdd != 0 else np.nan

        rows.append(
            {
                "Model": name,
                "Start": r.index.min().date(),
                "End": r.index.max().date(),
                "Months": len(r),
                "Sharpe (Excess)": shrp,
                "CAGR": c,
                "Vol (ann.)": vol,
                "Max DD": mdd,
                "Calmar": calmar,
            }
        )

    out = pd.DataFrame(rows).set_index("Model")
    return out


def _calendar_year_returns(r):
    r = pd.Series(r).dropna()
    if len(r) == 0:
        return pd.Series(dtype=float)
    by_year = (1 + r).groupby(r.index.year).prod() - 1
    return by_year


# Determine common window for comparison (SPY vs Linear vs XGB)
model_series = {
    "SPY": benchmark_ret,
}

if portfolio_returns_lin is not None:
    model_series["Linear"] = portfolio_returns_lin
if portfolio_returns_xgb is not None:
    model_series["XGB"] = portfolio_returns_xgb

common_idx = None
for s in model_series.values():
    idx = pd.Series(s).dropna().index
    common_idx = idx if common_idx is None else common_idx.intersection(idx)

common_idx = common_idx.sort_values()

if len(common_idx) == 0:
    print("No overlapping dates across models; cannot compare.")
else:
    aligned_models = {k: pd.Series(v).reindex(common_idx) for k, v in model_series.items()}
    rf_common = df_total.set_index("date")["RF"].reindex(common_idx)

    # Summary table
    summary = _perf_summary_table(aligned_models, rf_common)
    summary_fmt = summary.copy()

    for col in ["Sharpe (Excess)", "CAGR", "Vol (ann.)", "Max DD", "Calmar"]:
        if col in ["Sharpe (Excess)", "Calmar"]:
            summary_fmt[col] = summary_fmt[col].map(lambda x: f"{x:.3f}" if pd.notna(x) else "")
        else:
            summary_fmt[col] = summary_fmt[col].map(lambda x: f"{x:.2%}" if pd.notna(x) else "")

    print("MODEL SUMMARY")
    print(f"Common start: {common_idx.min().date()}  |  Common end: {common_idx.max().date()}  |  Months: {len(common_idx)}")
    display(summary_fmt)

    yr = pd.DataFrame({name: _calendar_year_returns(r) for name, r in aligned_models.items()}).sort_index()
    yr_fmt = yr.map(lambda x: f"{x:.2%}" if pd.notna(x) else "")  # DataFrame.map (applymap is deprecated)

    print("\nCalendar-year returns (aligned window):")
    display(yr_fmt)

    # Simple comparison plots (3 total)
    plot_df = pd.DataFrame(aligned_models).fillna(0.0)

    # Cumulative returns
    cum = (1 + plot_df).cumprod()
    fig = plt.figure(figsize=(12, 5))
    for col in cum.columns:
        plt.plot(cum.index, cum[col], label=col)
    plt.title("Cumulative Returns (Aligned): SPY vs Linear vs XGB")
    plt.legend()
    fig.savefig("docs/assets/compare_cumulative.png", dpi=200, bbox_inches="tight")
    plt.show()

    # Drawdowns
    dd = cum / cum.cummax() - 1
    fig = plt.figure(figsize=(12, 4))
    for col in dd.columns:
        plt.plot(dd.index, dd[col], label=col)
    plt.axhline(0, color="black", linewidth=0.8)
    plt.title("Drawdowns (Aligned): SPY vs Linear vs XGB")
    plt.legend()
    fig.savefig("docs/assets/compare_drawdowns.png", dpi=200, bbox_inches="tight")
    plt.show()

    # Rolling 12-month returns
    roll_12m = (1 + plot_df).rolling(12).apply(np.prod, raw=True) - 1
    fig = plt.figure(figsize=(12, 4))
    for col in roll_12m.columns:
        plt.plot(roll_12m.index, roll_12m[col], label=col)
    plt.axhline(0, color="black", linewidth=0.8)
    plt.title("Rolling 12-Month Returns (Aligned): SPY vs Linear vs XGB")
    plt.legend()
    fig.savefig("docs/assets/compare_rolling_12m.png", dpi=200, bbox_inches="tight")
    plt.show()
MODEL SUMMARY
Common start: 2011-02-01  |  Common end: 2025-10-01  |  Months: 177
| Model  | Start      | End        | Months | Sharpe (Excess) | CAGR   | Vol (ann.) | Max DD  | Calmar |
|--------|------------|------------|--------|-----------------|--------|------------|---------|--------|
| SPY    | 2011-02-01 | 2025-10-01 | 177    | 0.884           | 14.00% | 14.44%     | -23.97% | 0.584  |
| Linear | 2011-02-01 | 2025-10-01 | 177    | 0.969           | 18.71% | 18.03%     | -32.46% | 0.576  |
| XGB    | 2011-02-01 | 2025-10-01 | 177    | 1.085           | 25.06% | 21.59%     | -27.06% | 0.926  |
Calendar-year returns (aligned window):
| Year | SPY     | Linear  | XGB     |
|------|---------|---------|---------|
| 2011 | -1.05%  | -4.40%  | -11.07% |
| 2012 | 15.90%  | 17.47%  | 33.25%  |
| 2013 | 32.53%  | 24.37%  | 55.60%  |
| 2014 | 13.45%  | 27.13%  | 28.38%  |
| 2015 | 1.19%   | 5.23%   | 16.73%  |
| 2016 | 12.00%  | 14.37%  | 14.09%  |
| 2017 | 21.80%  | 17.57%  | 28.27%  |
| 2018 | -4.64%  | 3.69%   | -7.15%  |
| 2019 | 31.34%  | 48.76%  | 45.48%  |
| 2020 | 18.41%  | 43.40%  | 53.55%  |
| 2021 | 28.82%  | 23.22%  | 44.18%  |
| 2022 | -18.26% | -25.41% | -14.82% |
| 2023 | 26.24%  | 31.61%  | 41.60%  |
| 2024 | 24.97%  | 46.55%  | 31.92%  |
| 2025 | 17.79%  | 27.41%  | 40.61%  |
[Figure: Cumulative Returns (Aligned): SPY vs Linear vs XGB]
[Figure: Drawdowns (Aligned): SPY vs Linear vs XGB]
[Figure: Rolling 12-Month Returns (Aligned): SPY vs Linear vs XGB]
In [45]:
# Simple model summary box (SPY vs Linear vs XGB)

# Ensure full column widths are displayed
pd.set_option("display.max_colwidth", None)

def _max_dd_from_returns(r):
    r = pd.Series(r).dropna()
    if len(r) == 0:
        return np.nan
    cum = (1 + r).cumprod()
    return (cum / cum.cummax() - 1).min()


def _summary_row(name, r, rf=None, features="", objective="", selection=""):
    r = pd.Series(r).dropna()
    if len(r) == 0:
        return None

    rf_al = rf.reindex(r.index) if rf is not None else pd.Series(index=r.index, data=0.0)
    ex = r - rf_al

    return {
        "Model": name,
        "Features": features,
        "Objective/Label": objective,
        "Portfolio Rule": selection,
        "Start": r.index.min().date(),
        "End": r.index.max().date(),
        "Months": len(r),
        "Sharpe": sharpe_ratio(ex),
        "CAGR": cagr(r),
        "Max DD": _max_dd_from_returns(r),
    }


rf_series = df_total.set_index("date")["RF"]

rows = []

# SPY benchmark
rows.append(
    _summary_row(
        name="SPY (Benchmark)",
        r=benchmark_ret,
        rf=rf_series,
        features="N/A",
        objective="Passive benchmark (S&P 500 ETF)",
        selection="Buy & Hold",
    )
)

# Linear classifier (Logistic Regression)
if portfolio_returns_lin is not None:
    rows.append(
        _summary_row(
            name="Linear (LogReg)",
            r=portfolio_returns_lin,
            rf=rf_series,
            features="5 rolling FF5 betas: MKT, SMB, HML, RMW, CMA",
            objective="Binary classification: P(y_excess > 0) via Logistic Regression",
            selection="Top-50 by predicted probability, equal weight",
        )
    )

# XGBoost ranker
if portfolio_returns_xgb is not None:
    rows.append(
        _summary_row(
            name="XGBRanker (pairwise)",
            r=portfolio_returns_xgb,
            rf=rf_series,
            features="5 rolling FF5 betas: MKT, SMB, HML, RMW, CMA",
            objective="Pairwise ranking: rel bins from y_excess; bin 0 for y_excess ≤ 0",
            selection="Top-50 by ranking score, equal weight",
        )
    )

summary_box = pd.DataFrame([r for r in rows if r is not None]).set_index("Model")

# Pretty formatting for display
summary_fmt = summary_box.copy()
summary_fmt["Sharpe"] = summary_fmt["Sharpe"].map(lambda x: f"{x:.3f}" if pd.notna(x) else "")
summary_fmt["CAGR"] = summary_fmt["CAGR"].map(lambda x: f"{x:.2%}" if pd.notna(x) else "")
summary_fmt["Max DD"] = summary_fmt["Max DD"].map(lambda x: f"{x:.2%}" if pd.notna(x) else "")

display(summary_fmt)
| Model | Features | Objective/Label | Portfolio Rule | Start | End | Months | Sharpe | CAGR | Max DD |
|---|---|---|---|---|---|---|---|---|---|
| SPY (Benchmark) | N/A | Passive benchmark (S&P 500 ETF) | Buy & Hold | 2008-02-01 | 2025-10-01 | 213 | 0.686 | 11.51% | -46.32% |
| Linear (LogReg) | 5 rolling FF5 betas: MKT, SMB, HML, RMW, CMA | Binary classification: P(y_excess > 0) via Logistic Regression | Top-50 by predicted probability, equal weight | 2011-02-01 | 2025-10-01 | 177 | 0.969 | 18.71% | -32.46% |
| XGBRanker (pairwise) | 5 rolling FF5 betas: MKT, SMB, HML, RMW, CMA | Pairwise ranking: rel bins from y_excess; bin 0 for y_excess ≤ 0 | Top-50 by ranking score, equal weight | 2011-02-01 | 2025-10-01 | 177 | 1.085 | 25.06% | -27.06% |

Risk-Adjusted Performance¶

Sharpe ratio analysis shows that both factor-based models outperform SPY on a risk-adjusted basis, with the XGBoost ranker achieving the highest Sharpe ratio. While the Linear FF5 model provides stable and interpretable excess returns, the XGBoost model delivers superior long-run performance, reflected in both higher CAGR and cumulative returns, at the cost of increased volatility.


Insights and Conclusions¶

Key Findings¶

This analysis compares three investment strategies: a passive SPY benchmark, a linear factor model (logistic regression on rolling Fama-French five-factor betas), and an XGBoost learning-to-rank model, each used to select S&P 500 stocks.

  • Performance Comparison: Over the aligned 2011-2025 window, XGBoost achieved the highest risk-adjusted returns (Sharpe ratio of 1.085), followed by Linear (0.969) and SPY (0.884 aligned; 0.686 over its full 2008-2025 history). XGBoost also delivered the strongest growth (25.06% CAGR) versus Linear (18.71%) and SPY (14.00% aligned), demonstrating the value of incorporating nonlinear factor exposures.

  • Model Complexity vs. Effectiveness: XGBoost's ability to capture nonlinear relationships led to superior performance in this study, outperforming both the simpler linear model and passive benchmark. This suggests that for this dataset and time period, machine learning can provide a better balance of complexity and predictive power.

  • Market Regimes: Factor-based models showed relative strength during volatile periods within the backtest window (e.g., the 2020 COVID crash, where both models far outpaced SPY), while the passive benchmark was hardest to beat in steady bull markets.

  • Practical Implications: Factor investing can enhance portfolio returns but requires careful implementation. The walk-forward backtesting revealed that out-of-sample performance is crucial for realistic evaluation.

Application¶

Now we use XGBoost to recommend which stocks to invest in, training on all available history and scoring the most recent month.

In [ ]:
# XGBoost used to recommend tickers for the latest available month

infer_k = 50

if not (_RUN_XGB and _HAS_XGBOOST) or panel_xgb is None or xgb_feature_cols is None:
    raise ValueError("XGBoost not enabled or panel_xgb/xgb_feature_cols missing. Run the XGB section first.")

latest_dt = pd.to_datetime(sorted(panel_xgb["date"].unique())[-1])
train_dates = sorted([d for d in panel_xgb["date"].unique() if pd.to_datetime(d) < latest_dt])

print("Inference month:", latest_dt.date())
print("Train months:", len(train_dates), "| Train end:", pd.to_datetime(train_dates[-1]).date())

train_x = panel_xgb[panel_xgb["date"].isin(train_dates)].copy().sort_values(["date", "ticker"])
test_x  = panel_xgb[panel_xgb["date"] == latest_dt].copy().sort_values(["date", "ticker"])

group_train = train_x.groupby("date").size().to_numpy()

xgb_model = XGBRanker(**xgb_model_params)
xgb_model.fit(
    train_x[xgb_feature_cols].values,
    train_x["rel"].values,
    group=group_train,
)

scores = xgb_model.predict(test_x[xgb_feature_cols].values)
xgb_rank = test_x.assign(score=scores).sort_values("score", ascending=False)

xgb_top = xgb_rank.head(min(infer_k, len(xgb_rank)))[["ticker", "score"]].reset_index(drop=True)

print("\nTop tickers to invest in (XGBRanker score):")
display(xgb_top)
Inference month: 2025-10-01
Train months: 212 | Train end: 2025-09-01
Top tickers to invest in (XGBRanker score):
ticker score
0 COIN 0.306163
1 TSLA 0.199746
2 FSLR 0.191979
3 PLTR 0.176649
4 DXCM 0.149607
5 HOOD 0.140152
6 WDC 0.139537
7 NVDA 0.137747
8 NCLH 0.122105
9 LRCX 0.118047
10 CRWD 0.114007
11 APP 0.113223
12 NUE 0.108160
13 SMCI 0.107619
14 CCL 0.103246
15 DASH 0.100171
16 WBD 0.099168
17 MU 0.098461
18 XYZ 0.098406
19 ORCL 0.081728
20 KKR 0.078721
21 TPR 0.071457
22 RCL 0.056819
23 INTC 0.046505
24 ALGN 0.041766
25 NRG 0.040935
26 UBER 0.038666
27 AMAT 0.035519
28 LVS 0.033460
29 DAL 0.024741
30 ON 0.024424
31 WSM 0.020018
32 CEG 0.017563
33 IVZ 0.017528
34 AVGO 0.015947
35 PODD 0.004497
36 AMD 0.002139
37 ISRG -0.008672
38 BX -0.012321
39 EXPE -0.014314
40 STLD -0.015703
41 RL -0.017430
42 IT -0.020022
43 DPZ -0.023975
44 TER -0.034858
45 QCOM -0.035040
46 TPL -0.037646
47 KLAC -0.041326
48 VST -0.044133
49 NTAP -0.046344

Limitations¶

  • Survivorship bias: the universe is based on a current S&P 500 constituents list (not point-in-time membership), which likely biases backtest results upward.
  • Data limitations: monthly returns reduce noise but also hide intra-month dynamics; delistings and corporate actions may not be fully captured.
  • Transaction costs: modeled as a simple turnover-based bps cost; real-world costs depend on liquidity, spreads, and market impact.
  • Forecasting: factor forecasts are a simple rolling mean baseline; more sophisticated forecasts may perform differently.
  • Retrospective analysis: The backtest is historical; real-world application would require ongoing model updating and adaptation to changing market conditions.
  • Simplified assumptions: Liquidity constraints, slippage, and other real-world frictions were not fully modeled.

Future Work¶

  • Explore additional factors beyond the Fama-French five (e.g., momentum, quality, or ESG factors).
  • Test alternative ML algorithms (e.g., neural networks, ensemble methods) for potentially better nonlinear capture.
  • Implement multi-asset strategies or international diversification.
  • Develop more sophisticated forecasting methods for factor returns.
  • Incorporate transaction costs more realistically and test different rebalancing frequencies.

Data Science Ethics¶

All datasets used in this project are publicly available and ethically sourced. Potential biases include survivorship bias in index data and regime dependency in factor premia. These concerns are mitigated through transparent documentation, reproducible code, and conservative interpretation of results.