Data Cleaning

This section was completed using R.

horses<-read.csv("../../../data/00-raw-data/horses.csv")
performances<-read.csv("../../../data/00-raw-data/performances.csv")

Let’s start by cleaning horses:

head(horses,2)

  horse_id       horse_name horse_name_zh
1     L247     REAL SUPREME      天外之天
2    44170 LIM'S INVINCIBLE      林家无敌
                                                                         horse_url
1 http://racing.hkjc.com/racing/info/horse/GetFilteredHorseFormRecord/english/L247
2          http://www.turfclub.com.sg/Racing/Pages/HorseProfile.aspx?HorseID=44170
  horse_country is_active country age             trainer_id   colour     sex
1            HK      TRUE     AUS   9                    CCW Chestnut Gelding
2            SG      TRUE     AUS   3 STEVEN HAROLD BURRIDGE                 
                 owner_name
1 Lai Ni Jan | Benvinda Lai
2              LIM'S STABLE
                                            owner_link import_type
1 ownersearch.asp?horseowner=Lai Ni Jan | Benvinda Lai         ISG
2                                                                 
  current_rating season_stake start_of_season_rating total_stake
1             34        37800                     39     6607700
2             50           NA                     NA       38182
                   sire
1           Flying Spur
2 I AM INVINCIBLE (AUS)
                                                                           same_sire_url
1 http://racing.hkjc.com/racing/info/Horse/SameSire/english/Rmx5aW5nIFNwdXIgICAgICAgICA=
2                                                                                       
  no_of_1_2_3                 dam no_of_start_past_10_meetings dam_sire
1    8-5-6-75           Irgunette                            1    Irgun
2             REIGNING CASH (AUS)                           NA         
                         same_sire last_updated
1 ["T387", "M273", "T280", "V334"]      08:18.0
2                                       11:32.8

We are going to remove “horse_name_zh” which contains the horses’ names in Cantonese, “horse_url”, “owner_url”, “sire”, “no_of_start_past_10_meetings”, “dam”,“dam_sire”, “same_sire and”last_updated” which are all irrelevant to our analysis.

horses.clean<-horses[,-c(3,4,13,16,19,20,22,23,24,25,26)]
head(horses.clean,2)

  horse_id       horse_name horse_country is_active country age
1     L247     REAL SUPREME            HK      TRUE     AUS   9
2    44170 LIM'S INVINCIBLE            SG      TRUE     AUS   3
              trainer_id   colour     sex                owner_name import_type
1                    CCW Chestnut Gelding Lai Ni Jan | Benvinda Lai         ISG
2 STEVEN HAROLD BURRIDGE                               LIM'S STABLE            
  current_rating start_of_season_rating total_stake no_of_1_2_3
1             34                     39     6607700    8-5-6-75
2             50                     NA       38182

We are only interested in horses that run in Hong Kong while there are some horses in this data set that raced in Singapore.

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

horses.clean<-horses.clean%>%
  filter(horse_country == "HK")

Now we want to clean “no_of_1_2_3”. This column represents how many 1st, 2nd, and 3rd place finishes the horse has ever had. The last number is the total number of races the horse has participated in. We are going to break up this column into seperate columns for placing finishes and number of races.

library(stringr)

split1 <-str_split_fixed(horses.clean$no_of_1_2_3, "-", 4)
horses.clean$first_place <- split1[,1]
horses.clean$second_place <- split1[,2]
horses.clean$third_place <- split1[,3]
horses.clean$races <- split1[,4]
# Remove original column
horses.clean <- horses.clean[,-15]
head(horses.clean,2)

  horse_id     horse_name horse_country is_active country age trainer_id
1     L247   REAL SUPREME            HK      TRUE     AUS   9        CCW
2     T422 AMAZING MOMENT            HK      TRUE     AUS   6        OSP
    colour     sex                owner_name import_type current_rating
1 Chestnut Gelding Lai Ni Jan | Benvinda Lai         ISG             34
2      Bay Gelding   Treinta Pares Syndicate          PP             60
  start_of_season_rating total_stake first_place second_place third_place races
1                     39     6607700           8            5           6    75
2                     57     1337225           1            1           4    19

Now let’s clean Performances

head(performances,2)

  horse_id horse_name horse_name_zh horse_no race_id race_no  race_date
1    34204  LORD GREG     LORD GREG       NA      NA       0 2007-06-09
2    34204  LORD GREG     LORD GREG       NA      NA       0 2007-07-01
  race_country final_placing    season going rating jockey_id jockey_name
1           SG             3 2006/2007     G     37  O CHAVEZ    O CHAVEZ
2           SG             2 2006/2007     G     37 B VORSTER   B VORSTER
  jockey_name_zh
1         查维斯
2         沃斯特
                                                                                    jockey_url
1  http://www.turfclub.com.sg/Industry/Jockeys/Pages/JockeyDirectoryDetails.aspx?Name=O+CHAVEZ
2 http://www.turfclub.com.sg/Industry/Jockeys/Pages/JockeyDirectoryDetails.aspx?Name=B+VORSTER
  trainer_id trainer_name trainer_name_zh
1   O CHAVEZ      L LAXON            纳逊
2  B VORSTER      L LAXON            纳逊
                                                                                          trainer_url
1 http://www.turfclub.com.sg/Industry/Trainers/Pages/TrainerDirectoryDetails.aspx?TrainerName=L+LAXON
2 http://www.turfclub.com.sg/Industry/Trainers/Pages/TrainerDirectoryDetails.aspx?TrainerName=L+LAXON
  winning_odds actual_weight running_positions finish_time on_date_weight gears
1          2.6          53.5             7 7 3     1:24:50            497     B
2          2.4          54.0             4 3 2 0.076736111            500     B
  draw length_behind_winner race_class distance course race_location race_url
1    6                 6.8L          5     1400                    S         
2    2                 3.8L          5     1800                    S         
  track sections sections_time last_updated
1  Turf                             00:41.5
2  Turf                             00:41.5

We are going to remove the column containing the Cantonese name for the horse, jockey name in Cantonese, jockey url, trainer name in Cantonese, trainer url, race url, running positions (redundant information),and last updated, and length behind winner. Length behind winner being deleted is okay because we have LBW for all 4 sections in another variable.

perf.clean<-performances[,-c(3,15,16,19,20,23,28,33,37)]
head(perf.clean,2)

  horse_id horse_name horse_no race_id race_no  race_date race_country
1    34204  LORD GREG       NA      NA       0 2007-06-09           SG
2    34204  LORD GREG       NA      NA       0 2007-07-01           SG
  final_placing    season going rating jockey_id jockey_name trainer_id
1             3 2006/2007     G     37  O CHAVEZ    O CHAVEZ   O CHAVEZ
2             2 2006/2007     G     37 B VORSTER   B VORSTER  B VORSTER
  trainer_name winning_odds actual_weight finish_time on_date_weight gears draw
1      L LAXON          2.6          53.5     1:24:50            497     B    6
2      L LAXON          2.4          54.0 0.076736111            500     B    2
  race_class distance course race_location track sections sections_time
1          5     1400                    S  Turf                       
2          5     1800                    S  Turf

We are going to once again filter out any race in the dataset that happened in Singapore.

perf.clean<-perf.clean%>%
  filter(race_country == "HK")

Let’s get rid of race_country column now that all of the values should equal “HK”

perf.clean<-perf.clean%>%
  select(-race_country)

Now I am going to break the section data into 4 distinct columns, both for LBW and time.

# Getting rid of brackets and commas in the columns
perf.clean$sections <- str_replace_all(perf.clean$sections,"\\[|\\]|\\,|\\{","")
perf.clean$sections <- str_replace_all(perf.clean$sections,"\\}",",")
perf.clean$sections_time <- str_replace_all(perf.clean$sections_time,"\\[|\\]|\\,|\\{","")
perf.clean$sections_time <- str_replace_all(perf.clean$sections_time,"\\}",",")

# splitting each section into their own column
split2 <-str_split_fixed(perf.clean$sections,",",6)
perf.clean$section1<-split2[,1]
perf.clean$section2<-split2[,2]
perf.clean$section3<-split2[,3]
perf.clean$section4<-split2[,4]
perf.clean$section5<-split2[,5]
perf.clean$section6<-split2[,6]

# same thing for section time
split3 <-str_split_fixed(perf.clean$sections_time,",",6)
perf.clean$section1_time<-split3[,1]
perf.clean$section2_time<-split3[,2]
perf.clean$section3_time<-split3[,3]
perf.clean$section4_time<-split3[,4]
perf.clean$section5_time<-split3[,5]
perf.clean$section6_time<-split3[,6]

Getting rid of our original section and section_time columns for redundancy reasons.

perf.clean<-select(perf.clean,-c(sections,sections_time))

Removing characters from section time column and converting to numeric

library(mgsub)
perf.clean$section1_time <- mgsub(perf.clean$section1_time,c('time','"',':',','),
                                  c('','','',''))
perf.clean$section2_time <- mgsub(perf.clean$section2_time,c('time','"',':',','),
                                  c('','','',''))
perf.clean$section3_time <- mgsub(perf.clean$section3_time,c('time','"',':',','),
                                  c('','','',''))
perf.clean$section4_time <- mgsub(perf.clean$section4_time,c('time','"',':',','),
                                  c('','','',''))
perf.clean$section5_time <- mgsub(perf.clean$section5_time,c('time','"',':',','),
                                  c('','','',''))
perf.clean$section6_time <- mgsub(perf.clean$section6_time,c('time','"',':',','),
                                  c('','','',''))

Saving the section time columns as numeric, along with our finishing time column.

perf.clean$section1_time <- as.numeric(perf.clean$section1_time)

Warning: NAs introduced by coercion

perf.clean$section2_time <- as.numeric(perf.clean$section2_time)

Warning: NAs introduced by coercion

perf.clean$section3_time <- as.numeric(perf.clean$section3_time)

Warning: NAs introduced by coercion

perf.clean$section4_time <- as.numeric(perf.clean$section4_time)

Warning: NAs introduced by coercion

perf.clean$section5_time <- as.numeric(perf.clean$section5_time)

Warning: NAs introduced by coercion

perf.clean$section6_time <- as.numeric(perf.clean$section6_time)

perf.clean$finish_time <- rowSums(perf.clean[,c("section1_time","section2_time",
                                                "section3_time","section4_time",
                                                "section5_time","section6_time")],na.rm=TRUE)

head(perf.clean,2)

  horse_id horse_name horse_no race_id race_no  race_date final_placing
1     A001 BURST AWAY        6     250       1 2016-12-11            11
2     A001 BURST AWAY        6     313       6 2017-01-04            11
     season going rating jockey_id jockey_name trainer_id trainer_name
1 2016/2017     G     72        MG     G Mosse        MKL      K L Man
2 2016/2017     G     72       YML   M L Yeung        MKL      K L Man
  winning_odds actual_weight finish_time on_date_weight gears draw race_class
1           49           125       70.04           1083         13          3
2           30           124       71.86           1073          6          3
  distance course race_location track                    section1
1     1200      A            ST  Turf "lbw": "5-1/4" "placing": 9
2     1200      A            HV  Turf "lbw": "2-1/4" "placing": 4
                      section2                      section3 section4 section5
1      "lbw": "3" "placing": 7      "lbw": "4" "placing": 11                  
2  "lbw": "1-1/2" "placing": 3  "lbw": "6-1/2" "placing": 11                  
  section6 section1_time section2_time section3_time section4_time
1                  24.21         22.03         23.80            NA
2                  23.73         23.61         24.52            NA
  section5_time section6_time
1            NA            NA
2            NA            NA

Now I want to create a variable that is a measure of a horse’s past performance. A past performance metric could be created to be a very complex variable that weights a multitude of features such as increases in rating over time, historical placement, finishing time history, etc.. I am going to simply create a variable that is the average placement of the horse up until the race date. I could do average race finishing time, but due to every horse running at different distances, I would have to create a an average finishing time for a horse for each distance up until the race date. This method could get computationally ineffcient and also really clutter our data with high dimensionality. For this reason, I am just going to do average placement up until the race date.

# Convert race_date to a Date object
perf.clean$race_date <- as.Date(perf.clean$race_date)

# Arrange the data by horse_id and race_date
perf.clean <- perf.clean %>%
  arrange(horse_id, race_date)

perf.clean <- perf.clean %>%
  group_by(horse_id) %>%
  mutate(average_placement = lag(cummean(final_placing))) %>%
  ungroup()

# For the first race of each horse, set average_placement to NA
perf.clean$average_placement[!duplicated(perf.clean$horse_id)] <- NA

# View the updated data frame
head(perf.clean)

# A tibble: 6 × 38
  horse_id horse_name horse_no race_id race_no race_date  final_placing season  
  <chr>    <chr>         <int>   <int>   <int> <date>             <int> <chr>   
1 A001     BURST AWAY        6     250       1 2016-12-11            11 2016/20…
2 A001     BURST AWAY        6     313       6 2017-01-04            11 2016/20…
3 A001     BURST AWAY        8     380      10 2017-01-30             6 2016/20…
4 A001     BURST AWAY        7      87       5 2017-10-05             9 2017/20…
5 A001     BURST AWAY        8     271       8 2017-12-13            11 2017/20…
6 A001     BURST AWAY       14     607       5 2018-04-29             9 2017/20…
# ℹ 30 more variables: going <chr>, rating <int>, jockey_id <chr>,
#   jockey_name <chr>, trainer_id <chr>, trainer_name <chr>,
#   winning_odds <dbl>, actual_weight <dbl>, finish_time <dbl>,
#   on_date_weight <int>, gears <chr>, draw <int>, race_class <chr>,
#   distance <int>, course <chr>, race_location <chr>, track <chr>,
#   section1 <chr>, section2 <chr>, section3 <chr>, section4 <chr>,
#   section5 <chr>, section6 <chr>, section1_time <dbl>, section2_time <dbl>, …

Now I also want to create a variable that indicates how much rest a horse was able to get between races.

perf.clean <- perf.clean %>%
  group_by(horse_id) %>%
  mutate(days_between = c(0, diff(race_date)))

head(perf.clean)

# A tibble: 6 × 39
# Groups:   horse_id [1]
  horse_id horse_name horse_no race_id race_no race_date  final_placing season  
  <chr>    <chr>         <int>   <int>   <int> <date>             <int> <chr>   
1 A001     BURST AWAY        6     250       1 2016-12-11            11 2016/20…
2 A001     BURST AWAY        6     313       6 2017-01-04            11 2016/20…
3 A001     BURST AWAY        8     380      10 2017-01-30             6 2016/20…
4 A001     BURST AWAY        7      87       5 2017-10-05             9 2017/20…
5 A001     BURST AWAY        8     271       8 2017-12-13            11 2017/20…
6 A001     BURST AWAY       14     607       5 2018-04-29             9 2017/20…
# ℹ 31 more variables: going <chr>, rating <int>, jockey_id <chr>,
#   jockey_name <chr>, trainer_id <chr>, trainer_name <chr>,
#   winning_odds <dbl>, actual_weight <dbl>, finish_time <dbl>,
#   on_date_weight <int>, gears <chr>, draw <int>, race_class <chr>,
#   distance <int>, course <chr>, race_location <chr>, track <chr>,
#   section1 <chr>, section2 <chr>, section3 <chr>, section4 <chr>,
#   section5 <chr>, section6 <chr>, section1_time <dbl>, section2_time <dbl>, …

Export cleaned data to csv

write.csv(horses.clean,"../../../data/01-modified-data/horses.clean.csv",row.names = FALSE)

write.csv(perf.clean,"../../../data/01-modified-data/perf.clean.csv",row.names = FALSE)