
Author

Ben Koshy

7 Model

7.1 Setup

Required Packages

Code
#install.packages("rvest")
#install.packages("stringr")
#install.packages("tidyverse")
#install.packages("janitor")
#install.packages("gt")
#install.packages("reactable")
#install.packages("ggplot2")
#install.packages("ggrepel")
#install.packages("dpylr")
#install.packages(caret)
#install.packages(pROC)
#install.packages(rpart)
#install.packages(randomForest)
#install.packages(xgboost)
#install.packages(gt)

library(rvest)
library(stringr)
library(tidyverse)
library(janitor)
library(gt)
library(reactable)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(caret)
library(pROC)
library(rpart)
library(randomForest)
library(xgboost)

7.2 Introduction and Initial Steps

With some newfound insight from our EDA, we can start developing potential models for predicting run play success. We will first try to fit an adequate regression model to the data and use it as a baseline for making predictions. Below we begin exploring potential models.

We first set up our workspace for reproducibility and prep the data. We remove NAs at this point to ensure there are no missing values that would interfere with model fitting and evaluation.

Code
set.seed(468) # For reproducibility

model_data <- run_plays |>
  select(play_id, successful_run, down, yards_to_go, absolute_yardline_number,
         offense_formation, rush_location_type, expected_points, score_diff,
         pff_run_concept_primary) |>
  filter(if_all(everything(), ~ !is.na(.))) |>
  mutate(
    successful_run = factor(successful_run, levels = c(FALSE, TRUE), labels = c("0", "1")),
    down = as.factor(down),
    offense_formation = as.factor(offense_formation),
    rush_location_type = as.factor(rush_location_type),
    pff_run_concept_primary = as.factor(pff_run_concept_primary)
  )
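
As a quick sanity check (a minimal sketch using the model_data frame built above), we can confirm that no missing values remain and look at how balanced the outcome is:

Code
# Verify the NA filter worked and inspect the outcome balance
sum(is.na(model_data))            # should be 0 after filtering
table(model_data$successful_run)  # counts of unsuccessful ("0") vs successful ("1") runs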

Next we split the data into training and testing sets.

Code
set.seed(468)
train_idx <- createDataPartition(model_data$successful_run, p = 0.7, list = FALSE)
train <- model_data[train_idx, ]
test  <- model_data[-train_idx, ]

7.3 Fitting Potential Models and Evaluations

First we fit a logistic regression.

Code
glm_fit <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                 offense_formation + rush_location_type +
                 expected_points + score_diff + pff_run_concept_primary,
               data = train, family = binomial)
glm_pred <- predict(glm_fit, newdata = test, type = "response")
glm_prob <- glm_pred
glm_pred_label <- ifelse(glm_pred > 0.5, "1", "0")

Then we try a decision tree.

Code
tree_fit <- rpart(successful_run ~ down + yards_to_go + absolute_yardline_number +
                    offense_formation + rush_location_type +
                    expected_points + score_diff + pff_run_concept_primary,
                  data = train, method = "class", cp = 0.01)
tree_pred_prob <- predict(tree_fit, newdata = test, type = "prob")[,"1"]
tree_pred_label <- predict(tree_fit, newdata = test, type = "class") # returns "0" or "1"

Next we fit a random forest.

Code
rf_fit <- randomForest(successful_run ~ down + yards_to_go + absolute_yardline_number +
                        offense_formation + rush_location_type +
                        expected_points + score_diff + pff_run_concept_primary,
                      data = train, ntree = 100, mtry = 3, importance = TRUE)
rf_pred_prob <- predict(rf_fit, newdata = test, type = "prob")[,"1"]
rf_pred_label <- predict(rf_fit, newdata = test, type = "response") # returns "0" or "1"

We fit an XGBoost model next.

Code
xg_train_label <- as.integer(as.character(train$successful_run))
xg_test_label  <- as.integer(as.character(test$successful_run))

train_mm <- model.matrix(~ down + yards_to_go + absolute_yardline_number +
                          offense_formation + rush_location_type +
                          expected_points + score_diff + pff_run_concept_primary,
                        data = train)[,-1]
test_mm  <- model.matrix(~ down + yards_to_go + absolute_yardline_number +
                         offense_formation + rush_location_type +
                         expected_points + score_diff + pff_run_concept_primary,
                        data = test)[,-1]

dtrain <- xgb.DMatrix(data = train_mm, label = xg_train_label)
dtest  <- xgb.DMatrix(data = test_mm,  label = xg_test_label)
xgb_fit <- xgboost(data = dtrain, objective = "binary:logistic", nrounds = 100, verbose = 0)
xgb_pred <- predict(xgb_fit, newdata = dtest)
xgb_pred_label <- ifelse(xgb_pred > 0.5, 1, 0)

We create a metric retrieval function to collect the metrics for each fit and then compile results for comparison.

Code
get_metrics <- function(truth, prob, pred_label) {
  levels_used <- c("0", "1")
  truth_f <- factor(as.character(truth), levels = levels_used)
  pred_label_f <- factor(as.character(pred_label), levels = levels_used)
  acc <- mean(pred_label_f == truth_f)
  # Convert AUC from S3 <auc> object to numeric
  auc <- as.numeric(pROC::roc(as.numeric(truth_f), as.numeric(prob))$auc)
  pr  <- caret::precision(truth_f, pred_label_f)
  rec <- caret::recall(truth_f, pred_label_f)
  f1  <- caret::F_meas(truth_f, pred_label_f)
  tibble(Accuracy = acc, ROC_AUC = auc, Precision = pr, Recall = rec, F1 = f1)
}

res_glm   <- get_metrics(test$successful_run, glm_prob, glm_pred_label)
res_tree  <- get_metrics(test$successful_run, tree_pred_prob, tree_pred_label)
res_rf    <- get_metrics(test$successful_run, rf_pred_prob, rf_pred_label)
res_xgb   <- get_metrics(xg_test_label, xgb_pred, xgb_pred_label) # for xgboost, label must be 0/1

results <- bind_rows(
    Logistic_Regression = res_glm,
    Decision_Tree       = res_tree,
    Random_Forest       = res_rf,
    XGBoost             = res_xgb,
    .id = "Model"
)

results |>
  mutate(across(where(is.numeric), round, 3)) |>
  gt() |>
  tab_header(title = "Model Comparison for Predicting Run Success")
Model Comparison for Predicting Run Success
Model Accuracy ROC_AUC Precision Recall F1
Logistic_Regression 0.584 0.624 0.517 0.559 0.537
Decision_Tree 0.577 0.611 0.702 0.536 0.608
Random_Forest 0.563 0.593 0.493 0.534 0.513
XGBoost 0.556 0.588 0.484 0.527 0.505

Some theory behind these metrics (a small worked example follows this list):

  • Accuracy: proportion of all predictions that are correct (\(\frac{TP + TN}{TP + TN + FP + FN}\))

    • Higher value suggests model predicts more accurately.

      • TP - True Positive

      • FP - False Positive

      • TN - True Negative

      • FN - False Negative

  • ROC AUC (Receiver Operating Characteristic Area Under the Curve): measures the model’s ability to discriminate between the two classes.

    • A value close to 1 means the model has better discrimination, while a value near 0.5 suggests the model is no better than random chance.

  • Precision: Proportion of predicted successes that are actual successes (\(\frac{TP}{TP + FP}\))

    • High precision means fewer false positives

      • TP - True Positive

      • FP - False Positive

  • Recall: Proportion of actual successful runs that are correctly identified as successes (\(\frac{TP}{TP + FN}\))

    • High recall means there are fewer false negatives

      • TP - True Positive

      • FN - False Negative

  • F1: Measure of balance between precision and recall (\(2*\frac{Precision * Recall}{Precision + Recall}\)).

    • Higher values indicate a better balance of precision and recall.

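To make these definitions concrete, here is a minimal sketch in R that computes each metric from a small set of hypothetical confusion-matrix counts (the counts are illustrative only and do not come from our models).

Code
# Hypothetical confusion-matrix counts (illustrative only)
TP <- 60; FP <- 40; TN <- 70; FN <- 30

accuracy  <- (TP + TN) / (TP + TN + FP + FN)                # 0.65
precision <- TP / (TP + FP)                                 # 0.60
recall    <- TP / (TP + FN)                                 # ~0.667
f1        <- 2 * precision * recall / (precision + recall)  # ~0.632

c(Accuracy = accuracy, Precision = precision, Recall = recall, F1 = f1)
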
From the table summary we observe that logistic regression has the best overall accuracy (0.584) and ROC AUC (0.624). It offers a decent balance of precision (0.517) and recall (0.559), with a moderate F1 (0.537). The decision tree has slightly lower accuracy and AUC but notably higher precision (0.702); however, this comes with much lower recall. Its F1 (0.608) is the highest, indicating the best balance of precision and recall. The random forest and XGBoost models score slightly lower on all metrics than logistic regression and the decision tree, with moderate precision, recall, and F1 throughout. The more complex models did not outperform these baselines, suggesting that the chosen features and the inherent unpredictability of run success favor simpler, interpretable approaches.

Code
# Random Forest
rf_imp <- as.data.frame(importance(rf_fit))
rf_imp$Variable <- rownames(rf_imp)
rf_imp |>
  arrange(desc(MeanDecreaseGini)) |>
  ggplot(aes(x = reorder(Variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "#D50A0A", color = "#ffffff", alpha = 0.9) +
  coord_flip() +
  labs(title = "Random Forest Variable Importance", x = "Feature", y = "Mean Decrease in Gini") +
  theme_minimal(base_size = 15)

Code
# XGBoost
xgb_imp <- xgb.importance(model = xgb_fit)
xgb_imp |>
  top_n(15, Gain) |>
  ggplot(aes(x = reorder(Feature, Gain), y = Gain)) +
  geom_col(fill = "#D50A0A", color = "#ffffff", alpha = 0.9) +
  coord_flip() +
  labs(title = "XGBoost Variable Importance", x = "Feature", y = "Gain") +
  theme_minimal(base_size = 15) 

It would be worthwhile to look at the ROC curves.

Code
#ROC Curves (Optional)
glm_roc <- pROC::roc(as.numeric(as.character(test$successful_run)), as.numeric(glm_prob))
tree_roc <- pROC::roc(as.numeric(as.character(test$successful_run)), as.numeric(tree_pred_prob))
rf_roc <- pROC::roc(as.numeric(as.character(test$successful_run)), as.numeric(rf_pred_prob))
xgb_roc <- pROC::roc(xg_test_label, xgb_pred)
plot(glm_roc, col = "#D50A0A", lwd = 2, main = "ROC Curves (Test Set)"); grid()
lines(tree_roc, col = "#4392f1", lwd = 2)
lines(rf_roc, col = "#013369", lwd = 2)  # distinct color so the RF curve is not confused with the GLM curve
lines(xgb_roc, col = "#232323", lwd = 2)
legend("bottomright", legend = c("Logistic Regression","Tree","Random Forest","XGBoost"),
       col = c("#D50A0A","#4392f1","#013369","#232323"), lwd=2, bty="n")

7.4 Moving Forward with Logistic Regression

We will move forward with our logistic regression model. We start by reintroducing our glm model fit:

Code
glm_fit <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                 offense_formation + rush_location_type +
                 expected_points + score_diff + pff_run_concept_primary,
               data = train, family = binomial)
summary(glm_fit)

Call:
glm(formula = successful_run ~ down + yards_to_go + absolute_yardline_number + 
    offense_formation + rush_location_type + expected_points + 
    score_diff + pff_run_concept_primary, family = binomial, 
    data = train)

Coefficients:
                                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)                          1.582e+00  4.545e-01   3.480 0.000502 ***
down2                               -3.363e-01  8.412e-02  -3.998 6.38e-05 ***
down3                               -1.203e+00  1.462e-01  -8.229  < 2e-16 ***
down4                               -1.401e+00  3.045e-01  -4.601 4.21e-06 ***
yards_to_go                         -1.812e-01  1.385e-02 -13.083  < 2e-16 ***
absolute_yardline_number            -6.203e-04  1.270e-03  -0.488 0.625350    
offense_formationI_FORM              4.738e-01  4.096e-01   1.157 0.247452    
offense_formationJUMBO              -1.582e-01  4.694e-01  -0.337 0.736076    
offense_formationPISTOL              3.809e-01  4.158e-01   0.916 0.359606    
offense_formationSHOTGUN             4.765e-01  4.003e-01   1.190 0.233891    
offense_formationSINGLEBACK          2.687e-01  4.032e-01   0.667 0.505065    
offense_formationWILDCAT             6.593e-01  5.065e-01   1.302 0.193049    
rush_location_typeINSIDE_RIGHT       6.047e-02  8.318e-02   0.727 0.467213    
rush_location_typeOUTSIDE_LEFT      -3.005e-01  9.264e-02  -3.244 0.001179 ** 
rush_location_typeOUTSIDE_RIGHT     -1.852e-01  8.983e-02  -2.061 0.039280 *  
expected_points                     -5.380e-02  2.120e-02  -2.538 0.011149 *  
score_diff                          -3.469e-05  3.507e-03  -0.010 0.992109    
pff_run_concept_primaryDRAW          3.570e-01  2.282e-01   1.564 0.117767    
pff_run_concept_primaryFB RUN        1.939e-01  4.854e-01   0.399 0.689588    
pff_run_concept_primaryINSIDE ZONE   7.000e-02  1.422e-01   0.492 0.622637    
pff_run_concept_primaryMAN           1.646e-01  1.493e-01   1.102 0.270266    
pff_run_concept_primaryOUTSIDE ZONE -5.060e-02  1.395e-01  -0.363 0.716771    
pff_run_concept_primaryPOWER         2.930e-01  1.599e-01   1.832 0.066920 .  
pff_run_concept_primaryPULL LEAD     3.906e-02  1.644e-01   0.238 0.812257    
pff_run_concept_primarySNEAK         1.787e+00  3.949e-01   4.526 6.00e-06 ***
pff_run_concept_primaryTRAP          4.017e-01  2.651e-01   1.515 0.129737    
pff_run_concept_primaryTRICK         6.577e-01  2.189e-01   3.005 0.002657 ** 
pff_run_concept_primaryUNDEFINED    -1.488e-01  3.676e-01  -0.405 0.685652    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6123.3  on 4430  degrees of freedom
Residual deviance: 5787.2  on 4403  degrees of freedom
AIC: 5843.2

Number of Fisher Scoring iterations: 4
Code
library(broom)
tidy(glm_fit) |>
  mutate(
    odds_ratio = exp(estimate)
  ) |>
  select(term, estimate, odds_ratio, std.error, p.value) |>
  arrange(desc(abs(estimate))) |>
  gt::gt() |>
  gt::fmt_number(columns = c(estimate, odds_ratio, std.error), decimals = 3) |>
  gt::tab_header(title = "Logistic Regression Coefficient Estimates and Odds Ratios")
Logistic Regression Coefficient Estimates and Odds Ratios
term estimate odds_ratio std.error p.value
pff_run_concept_primarySNEAK 1.787 5.973 0.395 6.003280e-06
(Intercept) 1.582 4.862 0.454 5.017038e-04
down4 −1.401 0.246 0.304 4.205369e-06
down3 −1.203 0.300 0.146 1.884548e-16
offense_formationWILDCAT 0.659 1.933 0.506 1.930492e-01
pff_run_concept_primaryTRICK 0.658 1.930 0.219 2.657282e-03
offense_formationSHOTGUN 0.476 1.610 0.400 2.338907e-01
offense_formationI_FORM 0.474 1.606 0.410 2.474517e-01
pff_run_concept_primaryTRAP 0.402 1.494 0.265 1.297365e-01
offense_formationPISTOL 0.381 1.464 0.416 3.596058e-01
pff_run_concept_primaryDRAW 0.357 1.429 0.228 1.177673e-01
down2 −0.336 0.714 0.084 6.384988e-05
rush_location_typeOUTSIDE_LEFT −0.301 0.740 0.093 1.178895e-03
pff_run_concept_primaryPOWER 0.293 1.340 0.160 6.691976e-02
offense_formationSINGLEBACK 0.269 1.308 0.403 5.050648e-01
pff_run_concept_primaryFB RUN 0.194 1.214 0.485 6.895876e-01
rush_location_typeOUTSIDE_RIGHT −0.185 0.831 0.090 3.927991e-02
yards_to_go −0.181 0.834 0.014 4.099729e-39
pff_run_concept_primaryMAN 0.165 1.179 0.149 2.702658e-01
offense_formationJUMBO −0.158 0.854 0.469 7.360764e-01
pff_run_concept_primaryUNDEFINED −0.149 0.862 0.368 6.856518e-01
pff_run_concept_primaryINSIDE ZONE 0.070 1.073 0.142 6.226370e-01
rush_location_typeINSIDE_RIGHT 0.060 1.062 0.083 4.672130e-01
expected_points −0.054 0.948 0.021 1.114862e-02
pff_run_concept_primaryOUTSIDE ZONE −0.051 0.951 0.139 7.167712e-01
pff_run_concept_primaryPULL LEAD 0.039 1.040 0.164 8.122569e-01
absolute_yardline_number −0.001 0.999 0.001 6.253502e-01
score_diff 0.000 1.000 0.004 9.921090e-01
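
As a worked reading of this table: the yards_to_go coefficient of −0.181 corresponds to an odds ratio of \(e^{-0.181} \approx 0.834\), so holding the other predictors fixed, each additional yard to go multiplies the odds of a successful run by roughly 0.83 (about a 17% reduction). Similarly, the SNEAK run concept’s odds ratio of 5.973 means the odds of success on a sneak are nearly six times those of the reference run concept.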

Model predictions:

Code
glm_pred <- predict(glm_fit, newdata = test, type = "response")
glm_pred_label <- ifelse(glm_pred > 0.5, "1", "0")
library(pROC)
roc_curve <- roc(test$successful_run, glm_pred)
plot(roc_curve, col = "#D50A0A", lwd = 2, main = "Logistic Regression ROC Curve")

Code
auc(roc_curve)
Area under the curve: 0.6244

We fit the final logistic regression model (with just the training fold):

Code
glm_fit <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                 offense_formation + rush_location_type +
                 expected_points + score_diff + pff_run_concept_primary,
               data = train, family = binomial)

For reference, we also fit the model below on the full set of model data and attach the predicted probabilities:

Code
glm_fit_full <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                      offense_formation + rush_location_type +
                      expected_points + score_diff + pff_run_concept_primary,
                    data = model_data, family = binomial)
model_data <- model_data |>
  mutate(pred_prob = predict(glm_fit_full, newdata = model_data, type = "response"))

7.5 Model Storage

Now that we have our model of choice, we need to store it for later access.

Libraries needed for this:

Code
#install.packages("pins")
#install.packages("vetiver")
#install.packages("plumber")
#install.packages("aws.s3")
#install.packages("DBI")
#install.packages("duckdb")
#install.packages("paws")

library(pins)
library(vetiver)
library(plumber)
library(aws.s3)
library(DBI)
library(duckdb)

Store the model data and predicted values. glm_fit_full is our model, so that is what we register with vetiver to store it.

Code
library(pins)
library(vetiver)
library(plumber)
library(aws.s3)
library(arrow)

Attaching package: 'arrow'
The following object is masked from 'package:lubridate':

    duration
The following object is masked from 'package:utils':

    timestamp
Code
library(paws)
Sys.setenv("AWS_ACCESS_KEY_ID" = Sys.getenv("AWS_ACCESS_KEY_ID"),
           "AWS_SECRET_ACCESS_KEY" = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
           "AWS_DEFAULT_REGION" = "us-east-2")

predictions <- model_data |>
  select(play_id, pred_prob)

bucket = "bkoshy-bdb-rushing"

s3write_using(predictions, 
              FUN = write_parquet, 
              bucket = bucket, object = "predictions_glm.parquet")

board <- board_s3(bucket = bucket) 

v <- vetiver_model(glm_fit_full, "glm_model")
vetiver_pin_write(board, v)
Creating new version '20250811T230948Z-5141c'
Writing to pin 'glm_model'

Create a Model Card for your published model
• Model Cards provide a framework for transparent, responsible reporting
• Use the vetiver `.Rmd` template as a place to start

Interface with the model:
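
Before containerizing, a minimal sketch of interacting with the stored model (assuming the board and the "glm_model" pin created above) is to read the pin back and serve it as a local plumber API; the port here is illustrative:

Code
# Read the pinned vetiver model back from the S3 board
v <- vetiver_pin_read(board, "glm_model")

# Serve it as a local REST API; plays POSTed to /predict
# return predictions from the stored model
library(plumber)
pr() |>
  vetiver_api(v) |>
  pr_run(port = 8080)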

Creating Dockerfile:

Code
vetiver_prepare_docker(
    board, 
    "glm_model", 
    docker_args = list(port = 8080)
)
- The lockfile is already up to date.