<-read.csv("../../../data/00-raw-data/horses.csv")
horses<-read.csv("../../../data/00-raw-data/performances.csv") performances
Data Cleaning
This section was completed using R.
Let’s start by cleaning horses:
head(horses,2)
horse_id horse_name horse_name_zh
1 L247 REAL SUPREME 天外之天
2 44170 LIM'S INVINCIBLE 林家无敌
horse_url
1 http://racing.hkjc.com/racing/info/horse/GetFilteredHorseFormRecord/english/L247
2 http://www.turfclub.com.sg/Racing/Pages/HorseProfile.aspx?HorseID=44170
horse_country is_active country age trainer_id colour sex
1 HK TRUE AUS 9 CCW Chestnut Gelding
2 SG TRUE AUS 3 STEVEN HAROLD BURRIDGE
owner_name
1 Lai Ni Jan | Benvinda Lai
2 LIM'S STABLE
owner_link import_type
1 ownersearch.asp?horseowner=Lai Ni Jan | Benvinda Lai ISG
2
current_rating season_stake start_of_season_rating total_stake
1 34 37800 39 6607700
2 50 NA NA 38182
sire
1 Flying Spur
2 I AM INVINCIBLE (AUS)
same_sire_url
1 http://racing.hkjc.com/racing/info/Horse/SameSire/english/Rmx5aW5nIFNwdXIgICAgICAgICA=
2
no_of_1_2_3 dam no_of_start_past_10_meetings dam_sire
1 8-5-6-75 Irgunette 1 Irgun
2 REIGNING CASH (AUS) NA
same_sire last_updated
1 ["T387", "M273", "T280", "V334"] 08:18.0
2 11:32.8
We are going to remove “horse_name_zh” which contains the horses’ names in Cantonese, “horse_url”, “owner_url”, “sire”, “no_of_start_past_10_meetings”, “dam”,“dam_sire”, “same_sire and”last_updated” which are all irrelevant to our analysis.
<-horses[,-c(3,4,13,16,19,20,22,23,24,25,26)]
horses.cleanhead(horses.clean,2)
horse_id horse_name horse_country is_active country age
1 L247 REAL SUPREME HK TRUE AUS 9
2 44170 LIM'S INVINCIBLE SG TRUE AUS 3
trainer_id colour sex owner_name import_type
1 CCW Chestnut Gelding Lai Ni Jan | Benvinda Lai ISG
2 STEVEN HAROLD BURRIDGE LIM'S STABLE
current_rating start_of_season_rating total_stake no_of_1_2_3
1 34 39 6607700 8-5-6-75
2 50 NA 38182
We are only interested in horses that run in Hong Kong while there are some horses in this data set that raced in Singapore.
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
<-horses.clean%>%
horses.cleanfilter(horse_country == "HK")
Now we want to clean “no_of_1_2_3”. This column represents how many 1st, 2nd, and 3rd place finishes the horse has ever had. The last number is the total number of races the horse has participated in. We are going to break up this column into seperate columns for placing finishes and number of races.
library(stringr)
<-str_split_fixed(horses.clean$no_of_1_2_3, "-", 4)
split1 $first_place <- split1[,1]
horses.clean$second_place <- split1[,2]
horses.clean$third_place <- split1[,3]
horses.clean$races <- split1[,4]
horses.clean# Remove original column
<- horses.clean[,-15]
horses.clean head(horses.clean,2)
horse_id horse_name horse_country is_active country age trainer_id
1 L247 REAL SUPREME HK TRUE AUS 9 CCW
2 T422 AMAZING MOMENT HK TRUE AUS 6 OSP
colour sex owner_name import_type current_rating
1 Chestnut Gelding Lai Ni Jan | Benvinda Lai ISG 34
2 Bay Gelding Treinta Pares Syndicate PP 60
start_of_season_rating total_stake first_place second_place third_place races
1 39 6607700 8 5 6 75
2 57 1337225 1 1 4 19
Now let’s clean Performances
head(performances,2)
horse_id horse_name horse_name_zh horse_no race_id race_no race_date
1 34204 LORD GREG LORD GREG NA NA 0 2007-06-09
2 34204 LORD GREG LORD GREG NA NA 0 2007-07-01
race_country final_placing season going rating jockey_id jockey_name
1 SG 3 2006/2007 G 37 O CHAVEZ O CHAVEZ
2 SG 2 2006/2007 G 37 B VORSTER B VORSTER
jockey_name_zh
1 查维斯
2 沃斯特
jockey_url
1 http://www.turfclub.com.sg/Industry/Jockeys/Pages/JockeyDirectoryDetails.aspx?Name=O+CHAVEZ
2 http://www.turfclub.com.sg/Industry/Jockeys/Pages/JockeyDirectoryDetails.aspx?Name=B+VORSTER
trainer_id trainer_name trainer_name_zh
1 O CHAVEZ L LAXON 纳逊
2 B VORSTER L LAXON 纳逊
trainer_url
1 http://www.turfclub.com.sg/Industry/Trainers/Pages/TrainerDirectoryDetails.aspx?TrainerName=L+LAXON
2 http://www.turfclub.com.sg/Industry/Trainers/Pages/TrainerDirectoryDetails.aspx?TrainerName=L+LAXON
winning_odds actual_weight running_positions finish_time on_date_weight gears
1 2.6 53.5 7 7 3 1:24:50 497 B
2 2.4 54.0 4 3 2 0.076736111 500 B
draw length_behind_winner race_class distance course race_location race_url
1 6 6.8L 5 1400 S
2 2 3.8L 5 1800 S
track sections sections_time last_updated
1 Turf 00:41.5
2 Turf 00:41.5
We are going to remove the column containing the Cantonese name for the horse, jockey name in Cantonese, jockey url, trainer name in Cantonese, trainer url, race url, running positions (redundant information),and last updated, and length behind winner. Length behind winner being deleted is okay because we have LBW for all 4 sections in another variable.
<-performances[,-c(3,15,16,19,20,23,28,33,37)]
perf.cleanhead(perf.clean,2)
horse_id horse_name horse_no race_id race_no race_date race_country
1 34204 LORD GREG NA NA 0 2007-06-09 SG
2 34204 LORD GREG NA NA 0 2007-07-01 SG
final_placing season going rating jockey_id jockey_name trainer_id
1 3 2006/2007 G 37 O CHAVEZ O CHAVEZ O CHAVEZ
2 2 2006/2007 G 37 B VORSTER B VORSTER B VORSTER
trainer_name winning_odds actual_weight finish_time on_date_weight gears draw
1 L LAXON 2.6 53.5 1:24:50 497 B 6
2 L LAXON 2.4 54.0 0.076736111 500 B 2
race_class distance course race_location track sections sections_time
1 5 1400 S Turf
2 5 1800 S Turf
We are going to once again filter out any race in the dataset that happened in Singapore.
<-perf.clean%>%
perf.cleanfilter(race_country == "HK")
Let’s get rid of race_country column now that all of the values should equal “HK”
<-perf.clean%>%
perf.cleanselect(-race_country)
Now I am going to break the section data into 4 distinct columns, both for LBW and time.
# Getting rid of brackets and commas in the columns
$sections <- str_replace_all(perf.clean$sections,"\\[|\\]|\\,|\\{","")
perf.clean$sections <- str_replace_all(perf.clean$sections,"\\}",",")
perf.clean$sections_time <- str_replace_all(perf.clean$sections_time,"\\[|\\]|\\,|\\{","")
perf.clean$sections_time <- str_replace_all(perf.clean$sections_time,"\\}",",") perf.clean
# splitting each section into their own column
<-str_split_fixed(perf.clean$sections,",",6)
split2 $section1<-split2[,1]
perf.clean$section2<-split2[,2]
perf.clean$section3<-split2[,3]
perf.clean$section4<-split2[,4]
perf.clean$section5<-split2[,5]
perf.clean$section6<-split2[,6] perf.clean
# same thing for section time
<-str_split_fixed(perf.clean$sections_time,",",6)
split3 $section1_time<-split3[,1]
perf.clean$section2_time<-split3[,2]
perf.clean$section3_time<-split3[,3]
perf.clean$section4_time<-split3[,4]
perf.clean$section5_time<-split3[,5]
perf.clean$section6_time<-split3[,6] perf.clean
Getting rid of our original section and section_time columns for redundancy reasons.
<-select(perf.clean,-c(sections,sections_time)) perf.clean
Removing characters from section time column and converting to numeric
library(mgsub)
$section1_time <- mgsub(perf.clean$section1_time,c('time','"',':',','),
perf.cleanc('','','',''))
$section2_time <- mgsub(perf.clean$section2_time,c('time','"',':',','),
perf.cleanc('','','',''))
$section3_time <- mgsub(perf.clean$section3_time,c('time','"',':',','),
perf.cleanc('','','',''))
$section4_time <- mgsub(perf.clean$section4_time,c('time','"',':',','),
perf.cleanc('','','',''))
$section5_time <- mgsub(perf.clean$section5_time,c('time','"',':',','),
perf.cleanc('','','',''))
$section6_time <- mgsub(perf.clean$section6_time,c('time','"',':',','),
perf.cleanc('','','',''))
Saving the section time columns as numeric, along with our finishing time column.
$section1_time <- as.numeric(perf.clean$section1_time) perf.clean
Warning: NAs introduced by coercion
$section2_time <- as.numeric(perf.clean$section2_time) perf.clean
Warning: NAs introduced by coercion
$section3_time <- as.numeric(perf.clean$section3_time) perf.clean
Warning: NAs introduced by coercion
$section4_time <- as.numeric(perf.clean$section4_time) perf.clean
Warning: NAs introduced by coercion
$section5_time <- as.numeric(perf.clean$section5_time) perf.clean
Warning: NAs introduced by coercion
$section6_time <- as.numeric(perf.clean$section6_time) perf.clean
$finish_time <- rowSums(perf.clean[,c("section1_time","section2_time",
perf.clean"section3_time","section4_time",
"section5_time","section6_time")],na.rm=TRUE)
head(perf.clean,2)
horse_id horse_name horse_no race_id race_no race_date final_placing
1 A001 BURST AWAY 6 250 1 2016-12-11 11
2 A001 BURST AWAY 6 313 6 2017-01-04 11
season going rating jockey_id jockey_name trainer_id trainer_name
1 2016/2017 G 72 MG G Mosse MKL K L Man
2 2016/2017 G 72 YML M L Yeung MKL K L Man
winning_odds actual_weight finish_time on_date_weight gears draw race_class
1 49 125 70.04 1083 13 3
2 30 124 71.86 1073 6 3
distance course race_location track section1
1 1200 A ST Turf "lbw": "5-1/4" "placing": 9
2 1200 A HV Turf "lbw": "2-1/4" "placing": 4
section2 section3 section4 section5
1 "lbw": "3" "placing": 7 "lbw": "4" "placing": 11
2 "lbw": "1-1/2" "placing": 3 "lbw": "6-1/2" "placing": 11
section6 section1_time section2_time section3_time section4_time
1 24.21 22.03 23.80 NA
2 23.73 23.61 24.52 NA
section5_time section6_time
1 NA NA
2 NA NA
Now I want to create a variable that is a measure of a horse’s past performance. A past performance metric could be created to be a very complex variable that weights a multitude of features such as increases in rating over time, historical placement, finishing time history, etc.. I am going to simply create a variable that is the average placement of the horse up until the race date. I could do average race finishing time, but due to every horse running at different distances, I would have to create a an average finishing time for a horse for each distance up until the race date. This method could get computationally ineffcient and also really clutter our data with high dimensionality. For this reason, I am just going to do average placement up until the race date.
# Convert race_date to a Date object
$race_date <- as.Date(perf.clean$race_date)
perf.clean
# Arrange the data by horse_id and race_date
<- perf.clean %>%
perf.clean arrange(horse_id, race_date)
<- perf.clean %>%
perf.clean group_by(horse_id) %>%
mutate(average_placement = lag(cummean(final_placing))) %>%
ungroup()
# For the first race of each horse, set average_placement to NA
$average_placement[!duplicated(perf.clean$horse_id)] <- NA
perf.clean
# View the updated data frame
head(perf.clean)
# A tibble: 6 × 38
horse_id horse_name horse_no race_id race_no race_date final_placing season
<chr> <chr> <int> <int> <int> <date> <int> <chr>
1 A001 BURST AWAY 6 250 1 2016-12-11 11 2016/20…
2 A001 BURST AWAY 6 313 6 2017-01-04 11 2016/20…
3 A001 BURST AWAY 8 380 10 2017-01-30 6 2016/20…
4 A001 BURST AWAY 7 87 5 2017-10-05 9 2017/20…
5 A001 BURST AWAY 8 271 8 2017-12-13 11 2017/20…
6 A001 BURST AWAY 14 607 5 2018-04-29 9 2017/20…
# ℹ 30 more variables: going <chr>, rating <int>, jockey_id <chr>,
# jockey_name <chr>, trainer_id <chr>, trainer_name <chr>,
# winning_odds <dbl>, actual_weight <dbl>, finish_time <dbl>,
# on_date_weight <int>, gears <chr>, draw <int>, race_class <chr>,
# distance <int>, course <chr>, race_location <chr>, track <chr>,
# section1 <chr>, section2 <chr>, section3 <chr>, section4 <chr>,
# section5 <chr>, section6 <chr>, section1_time <dbl>, section2_time <dbl>, …
Now I also want to create a variable that indicates how much rest a horse was able to get between races.
<- perf.clean %>%
perf.clean group_by(horse_id) %>%
mutate(days_between = c(0, diff(race_date)))
head(perf.clean)
# A tibble: 6 × 39
# Groups: horse_id [1]
horse_id horse_name horse_no race_id race_no race_date final_placing season
<chr> <chr> <int> <int> <int> <date> <int> <chr>
1 A001 BURST AWAY 6 250 1 2016-12-11 11 2016/20…
2 A001 BURST AWAY 6 313 6 2017-01-04 11 2016/20…
3 A001 BURST AWAY 8 380 10 2017-01-30 6 2016/20…
4 A001 BURST AWAY 7 87 5 2017-10-05 9 2017/20…
5 A001 BURST AWAY 8 271 8 2017-12-13 11 2017/20…
6 A001 BURST AWAY 14 607 5 2018-04-29 9 2017/20…
# ℹ 31 more variables: going <chr>, rating <int>, jockey_id <chr>,
# jockey_name <chr>, trainer_id <chr>, trainer_name <chr>,
# winning_odds <dbl>, actual_weight <dbl>, finish_time <dbl>,
# on_date_weight <int>, gears <chr>, draw <int>, race_class <chr>,
# distance <int>, course <chr>, race_location <chr>, track <chr>,
# section1 <chr>, section2 <chr>, section3 <chr>, section4 <chr>,
# section5 <chr>, section6 <chr>, section1_time <dbl>, section2_time <dbl>, …
Export cleaned data to csv
write.csv(horses.clean,"../../../data/01-modified-data/horses.clean.csv",row.names = FALSE)
write.csv(perf.clean,"../../../data/01-modified-data/perf.clean.csv",row.names = FALSE)