# Packages

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from tqdm.notebook import tqdm

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  # silences some pandas warnings

# plotting config
%matplotlib inline
# save_plots = False # plots for the report

# Load the full dataset (takes a few minutes)
df = pd.read_csv("./df_contest_train_main.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3838753 entries, 0 to 3838752
Columns: 110 entries, as_of_date to GICS_code_prefix
dtypes: float64(103), int64(3), object(4)
memory usage: 3.1+ GB

# Examining the columns

# dictionary table has the column descriptions
dic_table = pd.read_csv("./data_dictionary.csv", sep=';')

for key in df.keys():
    if key not in list(dic_table['Field']):
        print(key, " is not in data_dictionary")

GICS_code  is not in data_dictionary
id_security_mapped  is not in data_dictionary
GICS_code_prefix  is not in data_dictionary

df[['id_security_mapped','GICS_code','GICS_code_prefix']].sample(5)

df.rename(columns={"id_security_mapped":"id_security"}, inplace=True)
df['id_security'] = df.id_security.astype('category')
df['GICS_code'] = df.GICS_code.astype('category')
df['GICS_code_prefix'] = df.GICS_code_prefix.astype('category')

df_company = df[df['id_security'] == 1105].copy() # I found that this id illustrates the point well

# Convert to datetime
df_company['as_of_date'] = pd.to_datetime(df_company['as_of_date'])
df_company['datadate'] = pd.to_datetime(df_company['datadate'])

# select the 5 last as_of_dates
latest_as_of_dates = df_company['as_of_date'].drop_duplicates().sort_values().tail(5)

# Plot the revenues
fig = plt.figure(figsize = (8,4))

for as_of in latest_as_of_dates:
    subset = df_company[df_company['as_of_date'] == as_of].sort_values('datadate')
    plt.plot(subset['datadate'], subset['revtq'], marker='o', label=as_of.date())

plt.legend(title = 'as_of_date', loc='best')
plt.xlabel('datadate (reported quarter)')
plt.ylabel('Revenue (GBP)')

if save_plots:
    plt.title('')
    fig.savefig('plots/revs_date.pdf', bbox_inches='tight')

plt.title('Quarterly Revenue Histories (One Line per as_of_date)')
plt.show()

# Sort so that the most recent datadates come first (per group)
df_sorted = df.sort_values(['id_security', 'as_of_date', 'datadate'], ascending=[True, True, False])

# Keep only the 4 most recent datadate entries per (id_security, as_of_date)
df_filtered = (
    df_sorted
    .groupby(['id_security', 'as_of_date'], 
             observed = True) # this is just to silence a warning
    .head(4)   # keep top 4 rows per group
    .reset_index(drop=True)
)

df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658855 entries, 0 to 658854
Columns: 110 entries, as_of_date to GICS_code_prefix
dtypes: category(3), float64(103), object(4)
memory usage: 541.1+ MB

# alias back to "df" for readability
df = df_filtered.copy()

df['id_security'] = df.id_security.astype('category')
df['GICS_code'] = df.GICS_code.astype('category')
df['GICS_code_prefix'] = df.GICS_code_prefix.astype('category')

# Easier to perform this conversion now
df['datadate'] = pd.to_datetime(df['datadate'])
df['as_of_date'] = pd.to_datetime(df['as_of_date'])

# The total number of rows in the dataset
Ntot = len(df)

df_company = df[df['id_security'] == 1105].copy()
test_as_of_dates = df_company['as_of_date'].drop_duplicates().sample(4, random_state=42)

plt.figure(figsize=(8,4))
colors = ['k','b','r','g']

for i, as_of in enumerate(test_as_of_dates):

    plt.axvline(as_of, lw=0.5, ls='--', color=colors[i])

    # Past revenues
    past_data = df_company[(df_company['as_of_date'] == as_of) & (df_company['datadate'] <= as_of)]
    past_data = past_data.sort_values('datadate')
    past_dates = past_data['datadate'].values[-4:]
    past_revs = past_data['revtq'].values[-4:]
    plt.plot(past_dates, past_revs, 'o-', color=colors[i], label="True past revenues" if i==0 else None)

    # True future revenues
    fut_data = df_company[df_company['datadate'] > as_of].drop_duplicates('datadate')
    fut_data = fut_data.sort_values('datadate')
    fut_dates = fut_data['datadate'].values[:4]
    fut_revs = fut_data['revtq'].values[:4]
    plt.plot(fut_dates, fut_revs, 's-', color=colors[i], label="True future revenues" if i==0 else None)

    # MA4 forecasted revenues
    forecast = np.ones(4) * np.mean(past_revs)
    plt.plot(fut_dates, forecast, '-', lw=2, color=colors[i], label="Moving average forecast" if i==0 else None)

    # display the errors
    mse = mean_squared_error(fut_revs, forecast)
    mape = mean_absolute_percentage_error(fut_revs, forecast)

    plt.text(fut_dates[0], forecast[0]-20, f"MSE={mse:.1f}\nMAPE={mape*100:.1f}%", color=colors[i], fontsize=7, alpha=0.8)

plt.legend()
plt.xlabel('datadate (reported quarter)')
plt.ylabel('Revenue (GBP)')
plt.show()

def good_col(col):
    # We consider a column good if the fraction of NA values is less than 20%
    if df[col].isna().sum()/Ntot < 0.2:
        return True
    return False

# initialize the features with the price
feature_cols = ['prccd_as_of_date_adj_GBP']

candidate_cols = df.columns
exclude_cols = ['as_of_date','id_security','datadate','is_final_q','is_final_a'] # not used for regression

# Additionally, we don't include any column which includes "date" (as_of_date or data_date)
# These refer to the price of the security in different currencies as well as exchange rates. we don't need those.
for col in candidate_cols:
    if 'date' in col and col not in exclude_cols:
        exclude_cols.append(col)
candidate_cols = [col for col in candidate_cols if col not in exclude_cols]

quarterly_cols = [col for col in candidate_cols if col.endswith('q')]
for col in quarterly_cols:
    if good_col(col):
        feature_cols.append(col)
        
print("Feature columns")
print('\t' + feature_cols[0])
for i in range(1,len(feature_cols)//4):
    print('\t' + ', '.join(feature_cols[i:i+5]))

Feature columns
	prccd_as_of_date_adj_GBP
	actq, aoq, apq, atq, ceqq
	aoq, apq, atq, ceqq, cheq
	apq, atq, ceqq, cheq, dlcq
	atq, ceqq, cheq, dlcq, dlttq
	ceqq, cheq, dlcq, dlttq, icaptq
	cheq, dlcq, dlttq, icaptq, intanq
	dlcq, dlttq, icaptq, intanq, invtq

def add_lagged_features(df, lag_columns, max_lag=2):
    """
    Adds lagged versions of selected features within each as_of_date snapshot.
    For example, for revtq and max_lag=2, creates revtq_lag1 and revtq_lag2.
    """
    df = df.copy()
    df = df.sort_values(['id_security', 'as_of_date', 'datadate'])

    for col in lag_columns:
        for lag in range(1, max_lag + 1):
            lagged_name = f"{col}_lag{lag}"
            df[lagged_name] = (df.groupby(['id_security','as_of_date'])[col]
                                  .shift(lag))
    return df

# Demo of add_lagged_features, to prove that there is no info leakage between ids or as_of_dates
# Uncomment below to see the demo

# # Filter to only the revtq feature, and the first 2 companies
# demo_df = df[['id_security', 'as_of_date', 'datadate', 'revtq']].dropna().sort_values(['id_security', 'as_of_date', 'datadate'])
# demo_df = demo_df[demo_df['id_security'].isin([1,2])]

# # Apply lag
# demo_lagged = add_lagged_features(demo_df, ['revtq'], max_lag=2)

# # Display
# print(demo_lagged.sort_values(['id_security', 'as_of_date', 'datadate']).to_string(max_rows=20))

lag_vars = ['revtq', 'ibq', 'atq', 'cheq', 'xoprq']
max_lag = 2
df = add_lagged_features(df, lag_columns=lag_vars, max_lag=max_lag)

lag_feature_cols = [f"{col}_lag{lag}" for col in lag_vars for lag in range(1, max_lag+1)]

print(len(pd.unique(df['id_security'])), ' unique securities')
print(len(pd.unique(df['GICS_code'])), ' unique GICS codes')
print(len(pd.unique(df['GICS_code_prefix'])), ' unique GICS code prefixes')

1949  unique securities
209  unique GICS codes
11  unique GICS code prefixes

# Add the one-hot encoded variables
one_hot = pd.get_dummies(df['GICS_code_prefix'], prefix='sector', drop_first=True)

categorical_linear_cols = one_hot.columns.tolist()
categorical_non_linear_cols = ['GICS_code_prefix', 'id_security']

# Add into the master df
df = pd.concat([df, one_hot], axis=1)

macro_dir = './FRED_macros/'
macro_vars = ['CPIAUCSL', 'UNRATE', 'DCOILWTICO', 'GDPC1', 'DTWEXM']

macro_df_all = None

for var in macro_vars:
    macro_df = pd.read_csv(macro_dir + f"{var}.csv")
    macro_df.rename(columns={"observation_date":"date"}, inplace=True)
    macro_df.columns = ['date', var]
    macro_df['date'] = pd.to_datetime(macro_df['date'])

    # resample over quarters
    macro_df = (
        macro_df.set_index('date')
            .resample('QE')  # 'QE' = calendar quarter end
            .mean()
            .reset_index()
    )

    if macro_df_all is None:
        macro_df_all = macro_df
    else:
        # merge on date
        macro_df_all = macro_df_all.merge(macro_df, on='date', how='outer')

# Add lags
macro_lag_cols = []

for var in macro_vars:
    macro_df_all[f'{var}_lag1'] = macro_df_all[var].shift(1) # lag 1 quarter
    macro_df_all[f'{var}_lag2'] = macro_df_all[var].shift(2) # lag 2 quarters

    # Drop the current quarter
    macro_df_all.drop(columns=var, inplace=True)

    macro_lag_cols.append(f'{var}_lag1')
    macro_lag_cols.append(f'{var}_lag2')

macro_df_all.head()

# Merge into the df by quarter end

# Step 1: Align company data's date to quarter-end
df['as_of_quarter'] = df['as_of_date'] + pd.tseries.offsets.QuarterEnd(0)

# Step 2: Merge in macro features using quarter-end as key
df = df.merge(macro_df_all, left_on='as_of_quarter', right_on='date', how='left')

# Step 3: Drop helper columns
df.drop(columns=['date','as_of_quarter'], inplace=True)

# drop accidental duplicate column names
df = df.loc[:,~df.columns.duplicated()].copy()

# recap
print("Feature recap")
print(len(feature_cols), " feature columns")
print(len(lag_feature_cols), "lagged feature columns")
print(len(categorical_linear_cols), "categorical features for linear models (1 one-hot encoded feature)")
print(len(categorical_non_linear_cols), "categorical feature columns for non-linear models")
print(len(macro_lag_cols), "lagged macroeconomic columns")

Feature recap
35  feature columns
10 lagged feature columns
10 categorical features for linear models (1 one-hot encoded feature)
2 categorical feature columns for non-linear models
10 lagged macroeconomic columns

max_companies = 25
max_features = 25

# Compute % of NaNs per (company, feature)
nan_rates = df.groupby('id_security')[feature_cols].apply(lambda g: g.isna().mean())
# nan_rates = df.groupby('id_security')[feature_cols + lag_feature_cols + macro_lag_cols].apply(lambda g: g.isna().mean())

# Filter top N companies with the most data
top_companies = df['id_security'].value_counts().head(max_companies).index
nan_rates = nan_rates.loc[nan_rates.index.isin(top_companies)]

# Optionally select most problematic features
if max_features is not None:
    feature_nans = nan_rates.mean(axis=0).sort_values(ascending=False).head(max_features).index
    nan_rates = nan_rates[feature_nans]

# Plot heatmap
fig = plt.figure(figsize=(8, 6))
sns.heatmap(nan_rates, cmap="Reds", cbar_kws={"label": "% missing"})
plt.xlabel("Feature")
plt.ylabel("Company (id_security)")

if save_plots:
    fig.savefig("plots/nan_heatmap.pdf", bbox_inches="tight")

plt.title("Fraction of Missing Values per (Company, Feature)")
plt.tight_layout()
plt.show()

# How many companies are "alive" during a given time period
fig,ax = plt.subplots(1, 1, figsize=(5, 3), sharey=False)
ax.hist(df['as_of_date'], bins=20, edgecolor='black')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Live Companies')

plt.tight_layout()
plt.show()

def assign_folds(df, n_folds=5, test_start='2016-01-01'):
    df = df.copy()
    df = df.sort_values('as_of_date')

    df['fold'] = -1

    # Fold 99 for the testing set
    df.loc[df['as_of_date'] >= pd.Timestamp(test_start), 'fold'] = 99

    # Only use pre-test rows for CV folds
    train_idx = df[df['fold'] == -1].index
    fold_sizes = len(train_idx) // n_folds  # divide the data into folds of equal sizes

    edge_dates = [df.loc[train_idx[0], 'as_of_date']]
    
    start, end = 0, -1
    for k in range(n_folds):
        start = end + 1
        end += fold_sizes
        fold_idx = train_idx[start:end+1]
        df.loc[fold_idx, 'fold'] = k
        edge_dates.append(df.loc[train_idx[end], 'as_of_date'])

    edge_dates.append(pd.Timestamp(test_start))

    return df, edge_dates

n_folds = 5
test_start = '2016-01-01'
df, edge_dates = assign_folds(df, n_folds=n_folds, test_start = test_start)

# Plot for the folds

fig,ax = plt.subplots(1, 1, figsize=(10, 4), sharey=False)
ax.hist(df['as_of_date'], bins=40, edgecolor='black', alpha=0.7)
# ax.hist(df['datadate'], bins=20, edgecolor='black') # similar
ax.set_xlabel('Year')
ax.set_ylabel('Number of Live Companies')
ax.set_ylim(10000, 20000)

for i in range(n_folds):
    ax.axvspan(edge_dates[i], edge_dates[i+1], color='k', alpha=0.2)
    ax.text(edge_dates[i]+pd.Timedelta(600, "D"), 18000, f"Fold #{i+1}", ha='center', fontsize=9)

ax.axvspan(edge_dates[-1], pd.Timestamp('2018-01-01'), color='g', alpha=0.4)
ax.text(pd.Timestamp('2017-01-01'), 18000, "Final\ntesting set", ha='center', fontsize=9)

if save_plots:
    fig.savefig("plots/kfold_split.pdf", bbox_inches="tight")
    
plt.tight_layout()
plt.show()

def build_training_rows_fast(df, feature_cols, n_horizon=4):
    
    df = df.sort_values(['id_security', 'as_of_date', 'datadate'])
    rows = []

    # Precompute: for each company, store the most recent revtq per future datadate  (chatgpt optimization!)
    future_targets = {}

    for cid, group in df.groupby('id_security'):
        # Build future map: datadate -> latest revtq (based on most recent as_of_date)
        future = (group[group['revtq'].notna()]
                  .sort_values(['datadate', 'as_of_date'])
                  .drop_duplicates('datadate', keep='last'))

        rev_map = dict(zip(future['datadate'], future['revtq']))
        future_targets[cid] = rev_map

    # Now iterate and build rows
    # tqdm for progress bar
    for cid, group in tqdm(df.groupby('id_security'), desc="Building training rows"):
        snapshots = group['as_of_date'].sort_values().unique()
        rev_map = future_targets[cid]

        for t in snapshots:
            snap_now = group[group['as_of_date'] == t]
            latest = snap_now[snap_now['datadate'] <= t]
            if latest.empty:
                continue
            latest = latest.sort_values('datadate').iloc[-1]
            X = latest[feature_cols]

            # Get 4 unique datadates after t from any row
            next_quarters = (group[group['datadate'] > t]
                             .sort_values('datadate')['datadate']
                             .drop_duplicates()
                             .head(n_horizon))

            if len(next_quarters) < n_horizon:
                continue

            y = [rev_map.get(d, np.nan) for d in next_quarters]  # returns nan if date missing instead of raising an error
            if any(pd.isna(y)):
                continue

            row = pd.Series(
                X.values.tolist() + y,
                index=feature_cols + [f'revtq+{k}' for k in range(1, n_horizon+1)]
            )

            # still need this metadata for indexing and analyzing
            row['as_of_date'] = t
            row['id_security'] = cid
            row['fold'] = latest.get('fold', -1)
            rows.append(row)

    return pd.DataFrame(rows)

# This cell takes a long time
# What is taking time is finding the next 4 quarter revenues for each as_of_date. For every as_of_date, we parse every future as_of_date
# and record available datadates. We then select the next 4 distinct datadates and their revenues. That is what needs to be predicted.
# When two future as_of_date contain the same datadate, we select the most recent one, as it should be the "most" accurate information

# Full feature set
full_feature_cols = (
    feature_cols +
    lag_feature_cols +
    macro_lag_cols +
    categorical_linear_cols +       # one-hot encoded sector columns for Ridge
    categorical_non_linear_cols     # categorical columns for LightGBM
)

train_df = build_training_rows_fast(df, feature_cols=full_feature_cols, n_horizon=4)
# train_df.info()

target_cols = ['revtq+1', 'revtq+2', 'revtq+3', 'revtq+4'] # these now exist in train_df

Building training rows:   0%|          | 0/1949 [00:00<?, ?it/s]

def safe_mape(y_true, y_pred, eps=1e-5):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denom = np.clip(np.abs(y_true), eps, None)  # avoid division by small or negative numbers
    return np.mean(np.abs((y_true - y_pred) / denom))

def evaluate_models(train_df, 
                    feature_cols_linear, feature_cols_lgb, categorical_cols_lgb, 
                    ridge_alpha=1.0, 
                    lgb_params=None):
    
    folds = [i for i in range(n_folds)]

    train_df = train_df[train_df['fold'] != -1] # exclude potential errors in folding (-1 was the default value of the fold)

    qmape_ma, qmape_ridge, qmape_lgb = [], [], [] # store the qmape errors for later plot

    for fold in folds[1:]:
        train = train_df[train_df['fold'] < fold]
        val = train_df[train_df['fold'] == fold]

        if train.empty or val.empty:
            print(f"Skipping fold {fold} due to empty train/val")
            continue

        print(f"Fold {fold}: train up to {train['as_of_date'].max().date()} | val = {val['as_of_date'].min().date()} -> {val['as_of_date'].max().date()}")

        ### --- Targets ---
        y_train = train[target_cols].values
        y_val = val[target_cols].values

        ### --- Baseline: Moving Average (per company) ---
        valid_ma_mask = val[['revtq_lag1', 'revtq_lag2']].notna().all(axis=1)
        ma_val = val.loc[valid_ma_mask, ['revtq_lag1', 'revtq_lag2']].mean(axis=1).values
        y_pred_ma = np.tile(ma_val.reshape(-1, 1), 4)
        y_val_ma = y_val[valid_ma_mask]

        ### --- Ridge Regression ---
        X_train_lr = train[feature_cols_linear].values
        X_val_lr = val[feature_cols_linear].values

        # Feature scaling (zero mean, unit variance)
        scaler = StandardScaler()
        X_train_lr = scaler.fit_transform(X_train_lr).astype(float)
        X_val_lr = scaler.transform(X_val_lr).astype(float)

        # Drop rows with NaNs from training set
        y_train = y_train.astype(float)
        y_val = y_val.astype(float)
        
        train_mask = ~np.isnan(X_train_lr).any(axis=1) & ~np.isnan(y_train).any(axis=1)
        X_train_lr_clean = X_train_lr[train_mask]
        y_train_clean = y_train[train_mask]
        
        val_mask = ~np.isnan(X_val_lr).any(axis=1) & ~np.isnan(y_val).any(axis=1)
        X_val_lr_clean = X_val_lr[val_mask]
        y_val_lr_clean = y_val[val_mask]

        if len(X_train_lr_clean) == 0 or len(y_train_clean) == 0:
            print(f"Skipping fold {fold} (Ridge) due to NaNs in training set")
            continue

        model_ridge = MultiOutputRegressor(Ridge(alpha=ridge_alpha))
        model_ridge.fit(X_train_lr_clean, y_train_clean)
        y_pred_ridge = model_ridge.predict(X_val_lr_clean)

        ### --- LightGBM ---
        X_train_lgb = train[feature_cols_lgb]
        X_val_lgb = val[feature_cols_lgb]

        model_lgb = MultiOutputRegressor(
            lgb.LGBMRegressor(**(lgb_params or {}))
        )
        model_lgb.fit(X_train_lgb, y_train, categorical_feature=categorical_cols_lgb)
        y_pred_lgb = model_lgb.predict(X_val_lgb)

        # --- Evaluate All ---
        for y_pred, y_val_val, name, store in zip(
            (y_pred_ma, y_pred_ridge, y_pred_lgb),
            (y_val_ma, y_val_lr_clean, y_val),
            ("Moving Avg", "Ridge", "LightGBM"),
            (qmape_ma, qmape_ridge, qmape_lgb)
        ):

            print(name)
            # overall errors
            mse = mean_squared_error(y_val_val, y_pred)
            # mape = mean_absolute_percentage_error(y_val_val, y_pred)
            mape = safe_mape(y_val_val, y_pred)
    
            # per quarter errors
            per_quarter_mse = [mean_squared_error(y_val_val[:, i], y_pred[:, i]) for i in range(4)]
            # per_quarter_mape = [mean_absolute_percentage_error(y_val_val[:, i], y_pred[:, i]) for i in range(4)]
            per_quarter_mape = [safe_mape(y_val_val[:, i], y_pred[:, i]) for i in range(4)]
            store.append(per_quarter_mape)
    
            # mse, mape, per_quarter_mse, per_quarter_mape = error_eval(y_val, y_pred)
            # store.append(per_quarter_mape)
    
            print(f"\t MSE  = {mse:.2f}  (quarters: {' | '.join(f'{x:.2f}' for x in per_quarter_mse)})")
            print(f"\t MAPE = {mape:.3f} (quarters: {' | '.join(f'{x:.3f}' for x in per_quarter_mape)})")

        print()

        # break

    return {
        'baseline': qmape_ma,
        'ridge': qmape_ridge,
        'lgb': qmape_lgb
    }

# Define feature sets
linear_features = feature_cols + lag_feature_cols + macro_lag_cols + categorical_linear_cols
lgb_features = feature_cols + lag_feature_cols + macro_lag_cols + categorical_non_linear_cols
categorical_lgb = categorical_non_linear_cols

# lgb hyperparameter tuning
lgb_params={
    'n_estimators': 300,
    'learning_rate': 0.1,
    'max_depth': 6,
    'verbosity': -1
}

# Run the evaluation
results = evaluate_models(
    train_df,
    feature_cols_linear=linear_features,
    feature_cols_lgb=lgb_features,
    categorical_cols_lgb=categorical_lgb,
    ridge_alpha=1.0,
    lgb_params=lgb_params
)

Fold 1: train up to 2001-08-31 | val = 2001-08-31 -> 2005-03-31
Moving Avg
	 MSE  = 28136.53  (quarters: 16440.78 | 25546.41 | 32038.48 | 38520.44)
	 MAPE = 1308.584 (quarters: 662.456 | 1148.459 | 1466.858 | 1956.563)
Ridge
	 MSE  = 165985.35  (quarters: 146917.73 | 75390.22 | 39817.86 | 401815.58)
	 MAPE = 22993.431 (quarters: 4023.028 | 12226.701 | 8301.897 | 67422.099)
LightGBM
	 MSE  = 21788.21  (quarters: 16985.78 | 20708.14 | 28734.98 | 20723.94)
	 MAPE = 2393.870 (quarters: 866.391 | 2156.369 | 3009.078 | 3543.643)

Fold 2: train up to 2005-03-31 | val = 2005-03-31 -> 2008-10-31
Moving Avg
	 MSE  = 18570.29  (quarters: 13650.70 | 16227.56 | 20445.79 | 23957.13)
	 MAPE = 0.251 (quarters: 0.209 | 0.235 | 0.269 | 0.292)
Ridge
	 MSE  = 28339.09  (quarters: 13352.01 | 25167.69 | 34764.18 | 40072.48)
	 MAPE = 1.006 (quarters: 0.570 | 0.929 | 1.251 | 1.273)
LightGBM
	 MSE  = 37527.35  (quarters: 34516.93 | 34585.22 | 39274.37 | 41732.88)
	 MAPE = 0.284 (quarters: 0.268 | 0.281 | 0.279 | 0.307)

Fold 3: train up to 2008-10-31 | val = 2008-10-31 -> 2012-06-29
Moving Avg
	 MSE  = 13203.77  (quarters: 10495.69 | 11282.78 | 14451.92 | 16584.70)
	 MAPE = 0.207 (quarters: 0.176 | 0.189 | 0.220 | 0.243)
Ridge
	 MSE  = 17107.44  (quarters: 17723.63 | 20465.20 | 13774.35 | 16466.57)
	 MAPE = 0.587 (quarters: 0.700 | 0.669 | 0.518 | 0.460)
LightGBM
	 MSE  = 18574.04  (quarters: 13653.61 | 21463.40 | 18735.62 | 20443.52)
	 MAPE = 0.233 (quarters: 0.204 | 0.238 | 0.242 | 0.249)

Fold 4: train up to 2012-06-29 | val = 2012-06-29 -> 2015-12-31
Moving Avg
	 MSE  = 26412.99  (quarters: 19444.42 | 21935.93 | 28872.73 | 35398.88)
	 MAPE = 0.215 (quarters: 0.180 | 0.192 | 0.228 | 0.258)
Ridge
	 MSE  = 15231.74  (quarters: 11693.09 | 13155.19 | 15392.09 | 20686.60)
	 MAPE = 0.357 (quarters: 0.278 | 0.360 | 0.401 | 0.390)
LightGBM
	 MSE  = 41625.88  (quarters: 29990.21 | 38761.99 | 40455.55 | 57295.80)
	 MAPE = 0.198 (quarters: 0.175 | 0.189 | 0.198 | 0.232)

target_cols = [f'revtq+{k}' for k in range(1, 5)]

# Train/val split
train_df = train_df[train_df['fold'] != -1]
train_data = train_df[train_df['fold'] < 5].copy()
test_data  = train_df[train_df['fold'] == 99].copy()

# Prepare inputs
X_train = train_data[lgb_features]
X_test  = test_data[lgb_features]
y_train = train_data[target_cols].values
y_test  = test_data[target_cols].values

# Fit model
model = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))
model.fit(X_train, y_train, categorical_feature=categorical_lgb)
y_pred = model.predict(X_test)

# Errors
mse_total = mean_squared_error(y_test, y_pred)
mape_total = mean_absolute_percentage_error(y_test, y_pred)

qmse = [mean_squared_error(y_test[:, i], y_pred[:, i]) for i in range(4)]
qmape = [mean_absolute_percentage_error(y_test[:, i], y_pred[:, i]) for i in range(4)]

# Report
print("Final Evaluation on Test Set")
print(f"LightGBM MSE  = {mse_total:.2f} (quarters: " + ' | '.join(f"{x:.2f}" for x in qmse) + ")")
print(f"LightGBM MAPE = {mape_total:.3f} (quarters: " + ' | '.join(f"{x:.3f}" for x in qmape) + ")")

Final Evaluation on Test Set
LightGBM MSE  = 16437.99 (quarters: 16095.23 | 16688.62 | 16785.69 | 16182.42)
LightGBM MAPE = 0.211 (quarters: 0.227 | 0.211 | 0.198 | 0.208)

# Create test set with actual and predicted values
test_df = test_data[['id_security'] + target_cols].copy()

# Add predictions from the last run
for i in range(4):
    test_df[f'y_pred_{i+1}'] = y_pred[:, i]

# Compute MAPE per company
company_mapes = []

for company_id, group in test_df.groupby('id_security'):
    y_true = group[[f'revtq+{i}' for i in range(1, 5)]].values
    y_predicted = group[[f'y_pred_{i}' for i in range(1, 5)]].values
    
    # Flatten arrays (1 row per company)
    y_true_flat = y_true.flatten()
    y_pred_flat = y_predicted.flatten()

    # Avoid div-by-zero; clip true values
    # mape = np.mean(np.abs(y_true_flat - y_pred_flat) / np.clip(y_true_flat, 1, None))
    if len(y_true_flat)==0 or len(y_pred_flat)==0:
        continue
    mape = np.mean(np.abs(y_true_flat - y_pred_flat) / y_true_flat)
    company_mapes.append(mape)

    # mark small errors as well predicted companies
    if mape < 0.05:
        print(f"id_security={company_id} : MAPE={mape:.2f}")

# Plot histogram
fig = plt.figure(figsize=(5, 4))
plt.hist(company_mapes, bins=30, edgecolor='black')
plt.xlabel("MAPE")
plt.ylabel("Number of Companies")
plt.grid(True)

# plot the overall mape from before
assert np.isclose(mape_total, np.mean(company_mapes), rtol=1e-2) # sanity check that the overall mape in both calculations are within 1%
plt.axvline(x=mape_total, ymin=-1, color='r', lw=2)
plt.text(mape_total, -30, f"{mape_total:.2f}\n(mean)", ha='center', color='r')

if save_plots:
    fig.savefig("plots/company_mape.pdf", bbox_inches="tight")

plt.title("Distribution of Company-wise MAPE (Test Set)")
plt.tight_layout()
plt.show()

id_security=448 : MAPE=0.04
id_security=570 : MAPE=0.04
id_security=581 : MAPE=0.05
id_security=694 : MAPE=0.04
id_security=768 : MAPE=0.04
id_security=876 : MAPE=0.04
id_security=879 : MAPE=0.04
id_security=897 : MAPE=0.04
id_security=1177 : MAPE=0.04
id_security=1273 : MAPE=-0.05
id_security=1498 : MAPE=0.05
id_security=1595 : MAPE=0.05
id_security=1648 : MAPE=0.04
id_security=1672 : MAPE=0.05
id_security=1682 : MAPE=0.04

def forecast_for_company_at_date(id_security, as_of_date):
    
    as_of_date = pd.to_datetime(as_of_date)

    # Split into training and prediction set
    train_subset = train_df[train_df['as_of_date'] < as_of_date].copy()
    as_of_date = train_subset['as_of_date'].max() # get an as_of_date in the dataset
    test_snapshot = train_df[(train_df['id_security'] == id_security) & (train_df['as_of_date'] == as_of_date)].copy()

    if test_snapshot.empty:
        print(f"No data available for company {id_security} at {as_of_date.date()}")
        return None, None

    # Prepare inputs
    X_train = train_subset[lgb_features].copy()
    y_train = train_subset[target_cols].values

    # Fit LightGBM model
    model = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))
    model.fit(X_train, y_train, categorical_feature=categorical_lgb)
    
    X_test = test_snapshot[lgb_features].copy()
    y_pred = model.predict(X_test)[0]

    # Actual values
    y_true = test_snapshot[target_cols].values[0]

    # if len(future_quarters) < 4:
    #     future_quarters = np.append(future_quarters, [np.nan] * (4 - len(future_quarters)))

    # 6. Display
    print(f"\nForecast for Company {id_security} at {as_of_date.date()}:")
    for i in range(4):
        print(f"  Quarter +{i+1}: Predicted = {y_pred[i]:.2f}, Actual = {y_true[i]:.2f}")
    print()

    return y_pred, y_true

# Some examples
_ = forecast_for_company_at_date(2, '2015-01-01')
_ = forecast_for_company_at_date(95, '2013-06-25')  # model is very accurate for id 95!

Forecast for Company 2 at 2014-12-31:
  Quarter +1: Predicted = 158.88, Actual = 138.76
  Quarter +2: Predicted = 168.61, Actual = 188.66
  Quarter +3: Predicted = 206.85, Actual = 225.61
  Quarter +4: Predicted = 214.46, Actual = 172.87


Forecast for Company 95 at 2013-05-31:
  Quarter +1: Predicted = 357.97, Actual = 354.05
  Quarter +2: Predicted = 360.65, Actual = 371.90
  Quarter +3: Predicted = 377.98, Actual = 410.56
  Quarter +4: Predicted = 388.36, Actual = 392.17

	id_security_mapped	GICS_code	GICS_code_prefix
2177020	1803	10101020	10
1589811	1227	45203010	45
670560	1485	40203020	40
528096	457	25301040	25
123455	837	20201030	20

	date	CPIAUCSL_lag1	CPIAUCSL_lag2	UNRATE_lag1	UNRATE_lag2	DCOILWTICO_lag1	DCOILWTICO_lag2	GDPC1_lag1	GDPC1_lag2	DTWEXM_lag1	DTWEXM_lag2
0	1990-03-31	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1990-06-30	128.033333	NaN	5.300000	NaN	21.777812	NaN	10047.386	NaN	93.096053	NaN
2	1990-09-30	129.300000	128.033333	5.333333	5.300000	17.769841	21.777812	10083.855	10047.386	93.436966	93.096053
3	1990-12-31	131.533333	129.300000	5.700000	5.333333	26.218615	17.769841	10090.569	10083.855	88.592227	93.436966
4	1991-03-31	133.766667	131.533333	6.133333	5.700000	32.089846	26.218615	9998.704	10090.569	84.429590	88.592227

Revenue Forecasting using Machine Learning¶

Simon Guichandut - May 2025¶

Understanding `as_of_date` vs. `datadate`¶

Baseline model: Moving Average¶

Feature selection for regression models¶

Lagged features¶

Categorical features¶

Macroeconomic features¶

Missing values¶

Time-based train/validation splits¶

Build the training dataset¶

Training and evaluating the models¶

Final performance evaluation on the test set¶

Make new predictions¶

Revenue Forecasting using Machine Learning¶

Simon Guichandut - May 2025¶

Understanding as_of_date vs. datadate¶

Baseline model: Moving Average¶

Feature selection for regression models¶

Lagged features¶

Categorical features¶

Macroeconomic features¶

Missing values¶

Time-based train/validation splits¶

Build the training dataset¶

Training and evaluating the models¶

Final performance evaluation on the test set¶

Make new predictions¶

Understanding `as_of_date` vs. `datadate`¶