Using Machine Learning to Predict 2023 Kentucky Derby Winning Race Times | Andrew Jocelyn | May 2023

Machine Learning


Photo by Keith Luke on Unsplash
# Load Libraries #
library(dplyr)
library(tidyr)
library(glmnet)
library(caret)
library(rpart)
library(gbm)
library(rsample)
library(plotly)
library(readr)
library(reticulate)

# Read in Data #
# Load the Kentucky Derby dataset; the steps below address its columns by
# position rather than by name.
data <- read.csv("...KD Data.csv")

# Declare Year Variable #
# First column of the file, extracted as a plain vector.
year <- data[[1]]

# Declare numeric x variables #
# Columns 2 through 4. (`numeric` shares its name with base::numeric(); the
# name is left unchanged so any other code using it keeps working.)
numeric <- data[, 2:4]

# Scale numeric x variables #
# Center each column to mean 0 and rescale it to standard deviation 1.
scaled_x <- scale(numeric, center = TRUE, scale = TRUE)

# Sanity check: column means should be ~0 and standard deviations 1.
colMeans(scaled_x)
apply(scaled_x, MARGIN = 2, FUN = sd)

The output should confirm that each numeric column is scaled to a mean of 0 and a standard deviation of 1.
# Declare y variable #
# The response is the sixth column of the raw data.
y <- data[, 6]

# One-Hot Encoding #
# Expand Weather into 0/1 indicator columns. A one-sided formula is enough
# (a design matrix needs no response), and passing `data =` gives the
# columns short names of the form "Weather<level>".
data$Weather <- as.factor(data$Weather)
# Drop the intercept column. drop = FALSE keeps a named matrix even when
# Weather has only two levels and a single indicator column remains.
xfactors <- model.matrix(~ Weather, data = data)[, -1, drop = FALSE]

# model.matrix() omits rows with a missing Weather value, which would
# misalign the indicators with the other columns; fail loudly instead.
stopifnot(nrow(xfactors) == nrow(scaled_x))

# Bring prepped data all back together #
scaled_df <- as.data.frame(cbind(year, y, scaled_x, xfactors))

# Isolate pre-2023 data #
# NOTE(review): assumes the first row of the file is the 2023 race to be
# predicted and every other row is historical -- confirm against the CSV.
old_data <- scaled_df[-1, ]
new_data <- scaled_df[1, ]

# Gradient Boosted Machine #
# Upper bound for the interaction depth: floor of the square root of the
# number of columns in the training frame (the response column y is part
# of that count).
floor(sqrt(ncol(old_data)))

Taking the floor of the square root of the number of columns gives a maximum interaction depth of 3 for the GBM.
# Find Optimal n.trees #
# Row subsampling (bag.fraction < 1) makes the fit stochastic, so fix the
# RNG seed to make the selected number of trees reproducible.
set.seed(2023)

tree_mod <- gbm(
  formula = y ~ .,
  distribution = "gaussian",
  data = old_data,
  shrinkage = 0.001,      # Small dataset so small shrinkage
  interaction.depth = 3,  # Determined above
  n.minobsinnode = 10,    # Default
  bag.fraction = 0.99,    # Small dataset, so this has to be large
  n.trees = 1000,
  n.cores = NULL,         # Only used to parallelize cross-validation folds
  verbose = FALSE
)

# Estimate the best number of trees from the out-of-bag (OOB) improvement
# in loss. No cross-validation is run here (cv.folds is not set), so this
# is an OOB estimate rather than a CV error.
best.iter <- gbm.perf(
  tree_mod,
  method = "OOB",
  plot.it = TRUE,
  oobag.curve = TRUE,
  overlay = TRUE
)
print(best.iter)

Plot of the out-of-bag (OOB) change in squared-error loss against the number of trees, used to choose the optimal number of trees.
The OOB estimate indicates that 500 trees is optimal for the GBM.
# Full GBM Model #
# With bag.fraction < 1 each tree is fit on a random subsample of the rows,
# so fix the RNG seed; otherwise the fitted model and its prediction vary
# from run to run.
set.seed(2023)

# Same hyperparameters as the tuning fit; n.trees = 500 follows the OOB
# estimate reported for that fit.
GBM <- gbm(
  formula = y ~ .,
  distribution = "gaussian",
  data = old_data,
  n.trees = 500,
  interaction.depth = 3,
  shrinkage = 0.001,
  n.minobsinnode = 10,
  bag.fraction = 0.99,
  train.fraction = 1,  # Use every historical row for training
  n.cores = NULL,
  verbose = FALSE
)
# 2023 Kentucky Derby Data #
# Predicting from a gbm object only needs the model, the new data and the
# number of trees. Training hyperparameters (shrinkage, bag.fraction, ...)
# passed to predict() fall into `...` and are ignored, so they are omitted.
# Reading n.trees from the fitted model keeps the prediction in sync with
# the number of trees that were actually fit.
GBM_Prediction <- predict(GBM, newdata = new_data, n.trees = GBM$n.trees)
The 2023 Kentucky Derby winning time is predicted to be 122.12 seconds, or 2 minutes and 2.12 seconds.



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *