
Author

Ben Koshy

7 Model

7.1 Setup

Required Packages

Code
#install.packages("rvest")
#install.packages("stringr")
#install.packages("tidyverse")
#install.packages("janitor")
#install.packages("gt")
#install.packages("reactable")
#install.packages("ggplot2")
#install.packages("ggrepel")
#install.packages("dpylr")
#install.packages(caret)
#install.packages(pROC)
#install.packages(rpart)
#install.packages(randomForest)
#install.packages(xgboost)
#install.packages(gt)

library(rvest)
library(stringr)
library(tidyverse)
library(janitor)
library(gt)
library(reactable)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(caret)
library(pROC)
library(rpart)
library(randomForest)
library(xgboost)

7.2 Introduction and Initial Steps

With some newfound insight from our EDA, we can start developing potential models for predicting run play success. We will first try to fit an adequate regression model to the data and use it as a baseline for making predictions. Below we begin exploring potential models.

We first set up our workspace for reproducibility and prep the data. We remove NAs at this point to ensure there are no missing values that would interfere with model fitting and evaluation.

Code
set.seed(468) # For reproducibility

model_data <- run_plays |>
  select(play_id, successful_run, down, yards_to_go, absolute_yardline_number,
         offense_formation, rush_location_type, expected_points, score_diff,
         pff_run_concept_primary) |>
  filter(if_all(everything(), ~ !is.na(.))) |>
  mutate(
    successful_run = factor(successful_run, levels = c(FALSE, TRUE), labels = c("0", "1")),
    down = as.factor(down),
    offense_formation = as.factor(offense_formation),
    rush_location_type = as.factor(rush_location_type),
    pff_run_concept_primary = as.factor(pff_run_concept_primary)
  )
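
As a quick sanity check (a minimal sketch using the model_data frame built above), we can confirm that no missing values remain and look at how balanced the outcome is:

Code
# Verify the NA filter worked and inspect the outcome balance
sum(is.na(model_data))            # should be 0 after filtering
table(model_data$successful_run)  # counts of unsuccessful ("0") vs successful ("1") runs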

Next we split the data into training and testing sets.

Code
set.seed(468)
train_idx <- createDataPartition(model_data$successful_run, p = 0.7, list = FALSE)
train <- model_data[train_idx, ]
test  <- model_data[-train_idx, ]

7.3 Fitting Potential Models and Evaluations

First we fit a logistic regression.

Code
glm_fit <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                 offense_formation + rush_location_type +
                 expected_points + score_diff + pff_run_concept_primary,
               data = train, family = binomial)
glm_pred <- predict(glm_fit, newdata = test, type = "response")
glm_prob <- glm_pred
glm_pred_label <- ifelse(glm_pred > 0.5, "1", "0")

Then we try a decision tree.

Code
tree_fit <- rpart(successful_run ~ down + yards_to_go + absolute_yardline_number +
                    offense_formation + rush_location_type +
                    expected_points + score_diff + pff_run_concept_primary,
                  data = train, method = "class", cp = 0.01)
tree_pred_prob <- predict(tree_fit, newdata = test, type = "prob")[,"1"]
tree_pred_label <- predict(tree_fit, newdata = test, type = "class") # returns "0" or "1"

Next we fit a random forest.

Code
rf_fit <- randomForest(successful_run ~ down + yards_to_go + absolute_yardline_number +
                        offense_formation + rush_location_type +
                        expected_points + score_diff + pff_run_concept_primary,
                      data = train, ntree = 100, mtry = 3, importance = TRUE)
rf_pred_prob <- predict(rf_fit, newdata = test, type = "prob")[,"1"]
rf_pred_label <- predict(rf_fit, newdata = test, type = "response") # returns "0" or "1"

We fit an XGBoost model next.

Code
xg_train_label <- as.integer(as.character(train$successful_run))
xg_test_label  <- as.integer(as.character(test$successful_run))

train_mm <- model.matrix(~ down + yards_to_go + absolute_yardline_number +
                          offense_formation + rush_location_type +
                          expected_points + score_diff + pff_run_concept_primary,
                        data = train)[,-1]
test_mm  <- model.matrix(~ down + yards_to_go + absolute_yardline_number +
                         offense_formation + rush_location_type +
                         expected_points + score_diff + pff_run_concept_primary,
                        data = test)[,-1]

dtrain <- xgb.DMatrix(data = train_mm, label = xg_train_label)
dtest  <- xgb.DMatrix(data = test_mm,  label = xg_test_label)
xgb_fit <- xgboost(data = dtrain, objective = "binary:logistic", nrounds = 100, verbose = 0)
xgb_pred <- predict(xgb_fit, newdata = dtest)
xgb_pred_label <- ifelse(xgb_pred > 0.5, 1, 0)

We create a metric retrieval function to collect the metrics for each fit and then compile results for comparison.

Code
get_metrics <- function(truth, prob, pred_label) {
  levels_used <- c("0", "1")
  truth_f <- factor(as.character(truth), levels = levels_used)
  pred_label_f <- factor(as.character(pred_label), levels = levels_used)
  acc <- mean(pred_label_f == truth_f)
  # Convert AUC from S3 <auc> object to numeric
  auc <- as.numeric(pROC::roc(as.numeric(truth_f), as.numeric(prob))$auc)
  pr  <- caret::precision(truth_f, pred_label_f)
  rec <- caret::recall(truth_f, pred_label_f)
  f1  <- caret::F_meas(truth_f, pred_label_f)
  tibble(Accuracy = acc, ROC_AUC = auc, Precision = pr, Recall = rec, F1 = f1)
}

res_glm   <- get_metrics(test$successful_run, glm_prob, glm_pred_label)
res_tree  <- get_metrics(test$successful_run, tree_pred_prob, tree_pred_label)
res_rf    <- get_metrics(test$successful_run, rf_pred_prob, rf_pred_label)
res_xgb   <- get_metrics(xg_test_label, xgb_pred, xgb_pred_label) # for xgboost, label must be 0/1

results <- bind_rows(
    Logistic_Regression = res_glm,
    Decision_Tree       = res_tree,
    Random_Forest       = res_rf,
    XGBoost             = res_xgb,
    .id = "Model"
)

results |>
  mutate(across(where(is.numeric), round, 3)) |>
  gt() |>
  tab_header(title = "Model Comparison for Predicting Run Success")
Model Comparison for Predicting Run Success
Model Accuracy ROC_AUC Precision Recall F1
Logistic_Regression 0.584 0.624 0.517 0.559 0.537
Decision_Tree 0.577 0.611 0.702 0.536 0.608
Random_Forest 0.563 0.593 0.493 0.534 0.513
XGBoost 0.556 0.588 0.484 0.527 0.505

Some theory behind these metrics (a small worked example follows this list):

  • Accuracy: proportion of all predictions that are correct (\(\frac{TP + TN}{TP + TN + FP + FN}\))

    • Higher value suggests model predicts more accurately.

      • TP - True Positive

      • FP - False Positive

      • TN - True Negative

      • FN - False Negative

  • ROC AUC (Receiver Operating Characteristic Area Under the Curve): measures the model’s ability to discriminate between the two classes.

    • A value close to 1 means the model has better discrimination, while a value near 0.5 suggests the model is no better than random chance.

  • Precision: Proportion of predicted successes that are actual successes (\(\frac{TP}{TP + FP}\))

    • High precision means fewer false positives

      • TP - True Positive

      • FP - False Positive

  • Recall: Proportion of actual successful runs that are correctly identified as successes (\(\frac{TP}{TP + FN}\))

    • High recall means there are fewer false negatives

      • TP - True Positive

      • FN - False Negative

  • F1: Measure of balance between precision and recall (\(2*\frac{Precision * Recall}{Precision + Recall}\)).

    • Higher values indicate a better balance of precision and recall.

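To make these definitions concrete, here is a minimal sketch in R that computes each metric from a small set of hypothetical confusion-matrix counts (the counts are illustrative only and do not come from our models).

Code
# Hypothetical confusion-matrix counts (illustrative only)
TP <- 60; FP <- 40; TN <- 70; FN <- 30

accuracy  <- (TP + TN) / (TP + TN + FP + FN)                # 0.65
precision <- TP / (TP + FP)                                 # 0.60
recall    <- TP / (TP + FN)                                 # ~0.667
f1        <- 2 * precision * recall / (precision + recall)  # ~0.632

c(Accuracy = accuracy, Precision = precision, Recall = recall, F1 = f1)
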
From the table summary we observe that logistic regression has the best overall accuracy (0.584) and ROC AUC (0.624). It offers a decent balance of precision (0.517) and recall (0.559), with a moderate F1 (0.537). The decision tree has slightly lower accuracy and AUC but notably higher precision (0.702); however, this comes with much lower recall. Its F1 (0.608) is the highest, indicating the best balance of precision and recall. The random forest and XGBoost models score slightly lower on all metrics than logistic regression and the decision tree, with moderate precision, recall, and F1 throughout. The more complex models did not outperform these baselines, suggesting that the chosen features and the inherent unpredictability of run success favor simpler, interpretable approaches.

Code
# Random Forest
rf_imp <- as.data.frame(importance(rf_fit))
rf_imp$Variable <- rownames(rf_imp)
rf_imp |>
  arrange(desc(MeanDecreaseGini)) |>
  ggplot(aes(x = reorder(Variable, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "#D50A0A", color = "#ffffff", alpha = 0.9) +
  coord_flip() +
  labs(title = "Random Forest Variable Importance", x = "Feature", y = "Mean Decrease in Gini") +
  theme_minimal(base_size = 15)

Code
# XGBoost
xgb_imp <- xgb.importance(model = xgb_fit)
xgb_imp |>
  top_n(15, Gain) |>
  ggplot(aes(x = reorder(Feature, Gain), y = Gain)) +
  geom_col(fill = "#D50A0A", color = "#ffffff", alpha = 0.9) +
  coord_flip() +
  labs(title = "XGBoost Variable Importance", x = "Feature", y = "Gain") +
  theme_minimal(base_size = 15) 

It would be worthwhile to look at the ROC curves.

Code
#ROC Curves (Optional)
glm_roc <- pROC::roc(as.numeric(as.character(test$successful_run)), as.numeric(glm_prob))
tree_roc <- pROC::roc(as.numeric(as.character(test$successful_run)), as.numeric(tree_pred_prob))
rf_roc <- pROC::roc(as.numeric(as.character(test$successful_run)), as.numeric(rf_pred_prob))
xgb_roc <- pROC::roc(xg_test_label, xgb_pred)
plot(glm_roc, col = "#D50A0A", lwd = 2, main = "ROC Curves (Test Set)"); grid()
lines(tree_roc, col = "#4392f1", lwd = 2)
lines(rf_roc, col = "#013369", lwd = 2)  # distinct color so the RF curve is not confused with the GLM curve
lines(xgb_roc, col = "#232323", lwd = 2)
legend("bottomright", legend = c("Logistic Regression","Tree","Random Forest","XGBoost"),
       col = c("#D50A0A","#4392f1","#013369","#232323"), lwd=2, bty="n")

7.4 Moving Forward with Logistic Regression

We will move forward with our logistic regression model. We start by reintroducing our glm model fit:

Code
glm_fit <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                 offense_formation + rush_location_type +
                 expected_points + score_diff + pff_run_concept_primary,
               data = train, family = binomial)
summary(glm_fit)

Call:
glm(formula = successful_run ~ down + yards_to_go + absolute_yardline_number + 
    offense_formation + rush_location_type + expected_points + 
    score_diff + pff_run_concept_primary, family = binomial, 
    data = train)

Coefficients:
                                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)                          1.582e+00  4.545e-01   3.480 0.000502 ***
down2                               -3.363e-01  8.412e-02  -3.998 6.38e-05 ***
down3                               -1.203e+00  1.462e-01  -8.229  < 2e-16 ***
down4                               -1.401e+00  3.045e-01  -4.601 4.21e-06 ***
yards_to_go                         -1.812e-01  1.385e-02 -13.083  < 2e-16 ***
absolute_yardline_number            -6.203e-04  1.270e-03  -0.488 0.625350    
offense_formationI_FORM              4.738e-01  4.096e-01   1.157 0.247452    
offense_formationJUMBO              -1.582e-01  4.694e-01  -0.337 0.736076    
offense_formationPISTOL              3.809e-01  4.158e-01   0.916 0.359606    
offense_formationSHOTGUN             4.765e-01  4.003e-01   1.190 0.233891    
offense_formationSINGLEBACK          2.687e-01  4.032e-01   0.667 0.505065    
offense_formationWILDCAT             6.593e-01  5.065e-01   1.302 0.193049    
rush_location_typeINSIDE_RIGHT       6.047e-02  8.318e-02   0.727 0.467213    
rush_location_typeOUTSIDE_LEFT      -3.005e-01  9.264e-02  -3.244 0.001179 ** 
rush_location_typeOUTSIDE_RIGHT     -1.852e-01  8.983e-02  -2.061 0.039280 *  
expected_points                     -5.380e-02  2.120e-02  -2.538 0.011149 *  
score_diff                          -3.469e-05  3.507e-03  -0.010 0.992109    
pff_run_concept_primaryDRAW          3.570e-01  2.282e-01   1.564 0.117767    
pff_run_concept_primaryFB RUN        1.939e-01  4.854e-01   0.399 0.689588    
pff_run_concept_primaryINSIDE ZONE   7.000e-02  1.422e-01   0.492 0.622637    
pff_run_concept_primaryMAN           1.646e-01  1.493e-01   1.102 0.270266    
pff_run_concept_primaryOUTSIDE ZONE -5.060e-02  1.395e-01  -0.363 0.716771    
pff_run_concept_primaryPOWER         2.930e-01  1.599e-01   1.832 0.066920 .  
pff_run_concept_primaryPULL LEAD     3.906e-02  1.644e-01   0.238 0.812257    
pff_run_concept_primarySNEAK         1.787e+00  3.949e-01   4.526 6.00e-06 ***
pff_run_concept_primaryTRAP          4.017e-01  2.651e-01   1.515 0.129737    
pff_run_concept_primaryTRICK         6.577e-01  2.189e-01   3.005 0.002657 ** 
pff_run_concept_primaryUNDEFINED    -1.488e-01  3.676e-01  -0.405 0.685652    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6123.3  on 4430  degrees of freedom
Residual deviance: 5787.2  on 4403  degrees of freedom
AIC: 5843.2

Number of Fisher Scoring iterations: 4
Code
library(broom)
tidy(glm_fit) |>
  mutate(
    odds_ratio = exp(estimate)
  ) |>
  select(term, estimate, odds_ratio, std.error, p.value) |>
  arrange(desc(abs(estimate))) |>
  gt::gt() |>
  gt::fmt_number(columns = c(estimate, odds_ratio, std.error), decimals = 3) |>
  gt::tab_header(title = "Logistic Regression Coefficient Estimates and Odds Ratios")
Logistic Regression Coefficient Estimates and Odds Ratios
term estimate odds_ratio std.error p.value
pff_run_concept_primarySNEAK 1.787 5.973 0.395 6.003280e-06
(Intercept) 1.582 4.862 0.454 5.017038e-04
down4 −1.401 0.246 0.304 4.205369e-06
down3 −1.203 0.300 0.146 1.884548e-16
offense_formationWILDCAT 0.659 1.933 0.506 1.930492e-01
pff_run_concept_primaryTRICK 0.658 1.930 0.219 2.657282e-03
offense_formationSHOTGUN 0.476 1.610 0.400 2.338907e-01
offense_formationI_FORM 0.474 1.606 0.410 2.474517e-01
pff_run_concept_primaryTRAP 0.402 1.494 0.265 1.297365e-01
offense_formationPISTOL 0.381 1.464 0.416 3.596058e-01
pff_run_concept_primaryDRAW 0.357 1.429 0.228 1.177673e-01
down2 −0.336 0.714 0.084 6.384988e-05
rush_location_typeOUTSIDE_LEFT −0.301 0.740 0.093 1.178895e-03
pff_run_concept_primaryPOWER 0.293 1.340 0.160 6.691976e-02
offense_formationSINGLEBACK 0.269 1.308 0.403 5.050648e-01
pff_run_concept_primaryFB RUN 0.194 1.214 0.485 6.895876e-01
rush_location_typeOUTSIDE_RIGHT −0.185 0.831 0.090 3.927991e-02
yards_to_go −0.181 0.834 0.014 4.099729e-39
pff_run_concept_primaryMAN 0.165 1.179 0.149 2.702658e-01
offense_formationJUMBO −0.158 0.854 0.469 7.360764e-01
pff_run_concept_primaryUNDEFINED −0.149 0.862 0.368 6.856518e-01
pff_run_concept_primaryINSIDE ZONE 0.070 1.073 0.142 6.226370e-01
rush_location_typeINSIDE_RIGHT 0.060 1.062 0.083 4.672130e-01
expected_points −0.054 0.948 0.021 1.114862e-02
pff_run_concept_primaryOUTSIDE ZONE −0.051 0.951 0.139 7.167712e-01
pff_run_concept_primaryPULL LEAD 0.039 1.040 0.164 8.122569e-01
absolute_yardline_number −0.001 0.999 0.001 6.253502e-01
score_diff 0.000 1.000 0.004 9.921090e-01
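
As a worked reading of this table: the yards_to_go coefficient of −0.181 corresponds to an odds ratio of \(e^{-0.181} \approx 0.834\), so holding the other predictors fixed, each additional yard to go multiplies the odds of a successful run by roughly 0.83 (about a 17% reduction). Similarly, the SNEAK run concept’s odds ratio of 5.973 means the odds of success on a sneak are nearly six times those of the reference run concept.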

Model predictions:

Code
glm_pred <- predict(glm_fit, newdata = test, type = "response")
glm_pred_label <- ifelse(glm_pred > 0.5, "1", "0")
library(pROC)
roc_curve <- roc(test$successful_run, glm_pred)
plot(roc_curve, col = "#D50A0A", lwd = 2, main = "Logistic Regression ROC Curve")

Code
auc(roc_curve)
Area under the curve: 0.6244

We fit the final logistic regression model (with just the training fold):

Code
glm_fit <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                 offense_formation + rush_location_type +
                 expected_points + score_diff + pff_run_concept_primary,
               data = train, family = binomial)

For reference, we also fit the model below on the full set of model data and attach the predicted probabilities:

Code
glm_fit_full <- glm(successful_run ~ down + yards_to_go + absolute_yardline_number +
                      offense_formation + rush_location_type +
                      expected_points + score_diff + pff_run_concept_primary,
                    data = model_data, family = binomial)
model_data <- model_data |>
  mutate(pred_prob = predict(glm_fit_full, newdata = model_data, type = "response"))

7.5 Model Storage

Now that we have our model of choice, we need to store it for later access.

Libraries needed for this:

Code
#install.packages("pins")
#install.packages("vetiver")
#install.packages("plumber")
#install.packages("aws.s3")
#install.packages("DBI")
#install.packages("duckdb")
#install.packages("paws")

library(pins)
library(vetiver)
library(plumber)
library(aws.s3)
library(DBI)
library(duckdb)

Store the model data and predicted values. glm_fit_full is our model, so that is what we register with vetiver to store it.

Code
library(pins)
library(vetiver)
library(plumber)
library(aws.s3)
library(arrow)

Attaching package: 'arrow'
The following object is masked from 'package:lubridate':

    duration
The following object is masked from 'package:utils':

    timestamp
Code
library(paws)
Sys.setenv("AWS_ACCESS_KEY_ID" = Sys.getenv("AWS_ACCESS_KEY_ID"),
           "AWS_SECRET_ACCESS_KEY" = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
           "AWS_DEFAULT_REGION" = "us-east-2")

predictions <- model_data |>
  select(play_id, pred_prob)

bucket = "bkoshy-bdb-rushing"

s3write_using(predictions, 
              FUN = write_parquet, 
              bucket = bucket, object = "predictions_glm.parquet")

board <- board_s3(bucket = bucket) 

v <- vetiver_model(glm_fit_full, "glm_model")
vetiver_pin_write(board, v)
Creating new version '20250811T230948Z-5141c'
Writing to pin 'glm_model'

Create a Model Card for your published model
• Model Cards provide a framework for transparent, responsible reporting
• Use the vetiver `.Rmd` template as a place to start

Interface with the model:
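
Before containerizing, a minimal sketch of interacting with the stored model (assuming the board and the "glm_model" pin created above) is to read the pin back and serve it as a local plumber API; the port here is illustrative:

Code
# Read the pinned vetiver model back from the S3 board
v <- vetiver_pin_read(board, "glm_model")

# Serve it as a local REST API; plays POSTed to /predict
# return predictions from the stored model
library(plumber)
pr() |>
  vetiver_api(v) |>
  pr_run(port = 8080)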

Creating Dockerfile:

Code
vetiver_prepare_docker(
    board, 
    "glm_model", 
    docker_args = list(port = 8080)
)
- The lockfile is already up to date.