Chapter 21 Feature Engineering: First, Second, and Third Set Winners and Losers
We extract features recording whether the eventual match winner won or lost the first, second, and third sets.
# Report who took one set of a match.
#
# scores    : a single score string with sets separated by spaces, e.g.
#             "4-6 7-6(7) 6-2". The first number of each set is treated as
#             the match winner's games and the second as the match loser's.
# setnumber : 1-based position of the set to inspect.
#
# Returns "winner" when the first number is larger than the second,
# "loser" otherwise, and "" when the requested set is missing or is not a
# "<games>-<games>" token (for example "RET" or "W/O").
whowon = function(scores,setnumber)
{
  sets = strsplit(as.character(scores), " ", fixed = TRUE)[[1]]
  set = sets[setnumber]

  # Only the leading "<games>-<games>" part is used; anything that follows,
  # such as a tie-break "(7)", is ignored.
  set_pattern = "^([0-9]+)-([0-9]+).*$"
  if (is.na(set) || !grepl(set_pattern, set))
  {
    return("")
  }

  winner_score = as.numeric(sub(set_pattern, "\\1", set))
  # Read every digit of the second number: keeping only its first character
  # would turn "10" into 1 and mislabel a set such as "10-12".
  loser_score = as.numeric(sub(set_pattern, "\\2", set))

  if (winner_score > loser_score)
  {
    setwinner = "winner"
  }else
  {
    setwinner = "loser"
  }
  return(setwinner)
}
# Label every match with who took each of the first three sets.
# vapply() pins the result to one string per match, so the new columns are
# character vectors even when `matches` has no rows (sapply() would return
# an empty list in that case).
matches$first_set = vapply(matches$score, whowon, character(1), setnumber = 1)
matches$second_set = vapply(matches$score, whowon, character(1), setnumber = 2)
matches$third_set = vapply(matches$score, whowon, character(1), setnumber = 3)
21.1 Percentage of Winners after losing 1st set
We calculate the percentage of matches in which the eventual winner lost the 1st set.
# Matches in which the eventual winner dropped the first set.
first_set_loser = filter(matches, first_set == "loser")

# Their share of all matches, in percent.
(nrow(first_set_loser) / nrow(matches)) * 100
## [1] 16.46598
21.2 Most wins after losing the first set
The bar plot shows the ten players with the most match wins after losing the first set.
# Horizontal bar chart of the ten players with the most wins among matches
# where the winner dropped the first set; each bar is annotated with its count.
first_set_loser %>%
  group_by(winner_name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  # Order the factor levels by Count so the bars appear sorted.
  mutate(winner_name = reorder(winner_name,Count)) %>%
  head(10) %>%
  ggplot(aes(x = winner_name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  # paste0() has no `sep` argument (an extra sep="" is just concatenated as
  # one more empty string), so the label is built from the three parts only.
  geom_text(aes(x = winner_name, y = 1, label = paste0("(",Count,")")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Winner',
       y = 'Count',
       title = 'Winner') +
  coord_flip() +
  theme_bw()
21.3 Percentage of Winners after losing 2nd set
We calculate the percentage of matches in which the eventual winner lost the 2nd set.
# Matches in which the eventual winner dropped the second set.
second_set_loser = filter(matches, second_set == "loser")

# Their share of all matches, in percent.
(nrow(second_set_loser) / nrow(matches)) * 100
## [1] 14.64302
21.4 Most wins after losing the second set
The bar plot shows the ten players with the most match wins after losing the second set.
# Matches in which the eventual winner dropped the second set.
# NOTE(review): this repeats the subset built for the percentage figure,
# presumably so the chunk can run on its own.
second_set_loser = matches %>%
  filter(second_set == "loser")

# Horizontal bar chart of the ten players with the most wins among those
# matches; each bar is annotated with its count.
second_set_loser %>%
  group_by(winner_name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  # Order the factor levels by Count so the bars appear sorted.
  mutate(winner_name = reorder(winner_name,Count)) %>%
  head(10) %>%
  ggplot(aes(x = winner_name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor) +
  # paste0() has no `sep` argument (an extra sep="" is just concatenated as
  # one more empty string), so the label is built from the three parts only.
  geom_text(aes(x = winner_name, y = 1, label = paste0("(",Count,")")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Winner',
       y = 'Count',
       title = 'Winner') +
  coord_flip() +
  theme_bw()
21.5 Percentage of Winners after losing 1st set in Grand Slams
We calculate the percentage of Grand Slam finals in which the eventual winner lost the 1st set.
# Grand Slam finals: tourney_level "G" and round "F".
gs_final = matches %>%
  filter(tourney_level == "G", round == "F")

# Finals in which the eventual winner dropped the first, second or third set.
gs_final_firstset_loser = gs_final %>%
  filter(first_set == "loser")
gs_final_secondset_loser = gs_final %>%
  filter(second_set == "loser")
gs_final_thirdset_loser = gs_final %>%
  filter(third_set == "loser")

# Percentage of Grand Slam finals won after losing the first set.
(nrow(gs_final_firstset_loser) / nrow(gs_final)) * 100
## [1] 12.5
21.6 Percentage of Winners after losing 2nd set in Grand Slams
We calculate the percentage of Grand Slam finals in which the eventual winner lost the 2nd set.
# Percentage of Grand Slam finals won after losing the second set.
(nrow(gs_final_secondset_loser) / nrow(gs_final)) * 100
## [1] 15.27778
21.7 Percentage of Winners after losing 1st set in Australian Open
We calculate the percentage of Australian Open finals in which the eventual winner lost the 1st set.
# Percentage of a tournament's finals that were won after losing a given set.
#
# matches        : data frame with columns tourney_name, round and the set
#                  label column being queried (first_set, second_set or
#                  third_set, holding "winner" / "loser" / "").
# tournamentName : value of tourney_name to select.
# loser          : which set the eventual winner lost: 1 (default), 2 or 3.
#
# Returns a single number between 0 and 100, or NaN when the tournament has
# no finals in `matches`. Stops with an error for any other `loser` value
# instead of silently reporting the second set.
percentWinnersTourney = function(matches,tournamentName,loser = 1)
{
  set_columns = c("first_set", "second_set", "third_set")
  set_index = match(loser, seq_along(set_columns))
  if (length(set_index) != 1 || is.na(set_index))
  {
    stop("`loser` must be 1, 2 or 3", call. = FALSE)
  }
  set_column = set_columns[set_index]
  if (!(set_column %in% names(matches)))
  {
    stop("`matches` has no column named '", set_column, "'", call. = FALSE)
  }

  # which() drops rows where the comparison is NA, so incomplete rows are
  # excluded from the finals rather than producing NA rows.
  is_final = matches$tourney_name == tournamentName & matches$round == "F"
  gs_final = matches[which(is_final), , drop = FALSE]

  # Only the requested set is counted; NA labels are not counted as losses.
  set_losses = sum(gs_final[[set_column]] == "loser", na.rm = TRUE)
  set_losses/nrow(gs_final) *100
}
# Build a styled, scrollable HTML table listing the finals of
# `tournamentName` in which the eventual winner lost the first set.
# The table shows winner_name, loser_name, year and score.
displayGrandSlamWinnersAfterLosingFirstSet = function(matches,tournamentName)
{
  comeback_finals = matches %>%
    filter(tourney_name == tournamentName,
           round == "F",
           first_set == "loser") %>%
    select(winner_name,loser_name,year,score)

  styled_table = kable_styling(
    kable(comeback_finals,"html"),
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
  scroll_box(styled_table, width = "800px")
}
# Australian Open finals won after losing the first set: share, then the list.
percentWinnersTourney(matches, tournamentName = "Australian Open")
## [1] 22.22222
displayGrandSlamWinnersAfterLosingFirstSet(matches, tournamentName = "Australian Open")
winner_name | loser_name | year | score |
---|---|---|---|
Jennifer Capriati | Martina Hingis | 2002 | 4-6 7-6(7) 6-2 |
Serena Williams | Lindsay Davenport | 2005 | 2-6 6-3 6-0 |
Kim Clijsters | Na Li | 2011 | 3-6 6-3 6-3 |
Victoria Azarenka | Na Li | 2013 | 4-6 6-4 6-3 |
21.8 Percentage of Winners after losing 2nd set in Australian Open
# Share of Australian Open finals won after losing the second set.
percentWinnersTourney(matches, tournamentName = "Australian Open", loser = 2)
## [1] 22.22222
21.9 Percentage of Winners after losing 1st set in Wimbledon
We calculate the percentage of Wimbledon finals in which the eventual winner lost the 1st set.
# Wimbledon finals won after losing the first set: share, then the list.
percentWinnersTourney(matches, tournamentName = "Wimbledon")
## [1] 16.66667
displayGrandSlamWinnersAfterLosingFirstSet(matches, tournamentName = "Wimbledon")
winner_name | loser_name | year | score |
---|---|---|---|
Serena Williams | Venus Williams | 2003 | 4-6 6-4 6-2 |
Venus Williams | Lindsay Davenport | 2005 | 4-6 7-6(4) 9-7 |
Amelie Mauresmo | Justine Henin | 2006 | 2-6 6-3 6-4 |
21.10 Percentage of Winners after losing 1st set in French Open
We calculate the percentage of French Open finals in which the eventual winner lost the 1st set.
# French Open finals won after losing the first set: share, then the list.
percentWinnersTourney(matches, tournamentName = "French Open")
## [1] 5.882353
displayGrandSlamWinnersAfterLosingFirstSet(matches, tournamentName = "French Open")
winner_name | loser_name | year | score |
---|---|---|---|
Jennifer Capriati | Kim Clijsters | 2001 | 1-6 6-4 12-10 |
21.11 Percentage of Winners after losing 1st set in US Open
We calculate the percentage of US Open finals in which the eventual winner lost the 1st set.
# US Open finals won after losing the first set: share, then the list.
percentWinnersTourney(matches, tournamentName = "US Open")
## [1] 0
displayGrandSlamWinnersAfterLosingFirstSet(matches, tournamentName = "US Open")
winner_name | loser_name | year | score |
---|---|---|---|
It is evident that losing the 1st set in a French Open or US Open final practically means losing the final. In this data, no player has won the US Open final after losing the 1st set, and only one player has won the French Open final after doing so.