Die folgende explorative Analyse rund um die Olympischen Sommerspiele basiert auf einem Datensatz, welcher auf der Plattform Kaggle - einer Online-Community von und für Datenwissenschaftler:innen - von User “rgriffin” unter dem Titel “120 years of Olympic History: Athletes and Results.”[1] zur Verfügung gestellt wurde. Der Hauptzweck von Kaggle ist die Organisation von Data-Science-Wettbewerben. Der bereitgestellte Datensatz umfasst alle Olympischen Sommer- und Winterspiele im Zeitraum 1896 bis 2016 und besteht aus 271.116 Sportler:innen mit 15 Attributen, d.h. ca. 4 Millionen Datenpunkten.
Das GOKA-Team hat hierbei den Fokus auf die Agediversity Dimension gelegt. Probiere die interaktiven Grafiken aus, um weitere Details zu erkunden.
<- data_events%>%filter(!is.na(Medal),!is.na(Year),Season=='Summer',Year>1890)%>%left_join(data_team,by=c("NOC"="NOC"))
sport $notes<-NULL
sport<- sport%>%group_by(Sport,region)%>%summarize(total=n())
sport
=c('Archery','Athletics','Basketball','Canoeing','Cycling','Equestrianism','Fencing','Gymnastics','Hockey','Rowing','Sailing','Shooting','Swimming','Table Tennis','Tennis','Weightlifting','Wrestling')
slist <- filter(sport,Sport%in%slist,total>=1)
sport
<- data_events%>%filter(!is.na(Medal),!is.na(Year),Season=='Summer',Year>1890)
age_mi <- age_mi%>%group_by(Sex,Sport)%>%summarize(Age=min(Age,na.rm = TRUE))
age_mi <- data_events%>%filter(!is.na(Medal),Season=="Summer")%>%right_join(age_mi,by=c("Sex","Sport","Age"))
age_min
<- filter(age_min,Sport%in%slist)
age_min
<-ggplot(age_min,aes(Sport,Age, color=Sport,fill=Name)) +
c geom_point(position = "dodge", width =.5,stat="identity") +
scale_color_viridis_d() +
coord_flip()+
facet_wrap(~Sex)+
theme_grey() +
#scale_x_discrete() +
scale_y_continuous(breaks=seq(10,22,by=2)) +
xlab("")+ylab("Alter")+
theme(legend.position = "none",
axis.text = element_text(size = 8,face="bold"),
plot.title = element_text(size=12,face = "bold")) #+
#ggtitle("Jüngste Olympia-Medaillengewinner:innen in ausgewählten Sportarten")
ggplotly(c)
<- data_events%>%filter(!is.na(Medal),!is.na(Year),Season=='Summer',Year>1890)%>%left_join(data_team,by=c("NOC"="NOC"))
sport $notes<-NULL
sport<- sport%>%group_by(Sport,region)%>%summarize(total=n())
sport
=c('Archery','Athletics','Basketball','Canoeing','Cycling','Equestrianism','Fencing','Gymnastics','Hockey','Rowing','Sailing','Shooting','Swimming','Table Tennis','Tennis','Weightlifting','Wrestling')
slist <- filter(sport,Sport%in%slist,total>=1)
sport
<- data_events%>%filter(!is.na(Medal),Season=='Summer')
age_mt <-age_mt%>%group_by(Sex,Sport)%>%summarize(Age=max(Age,na.rm = TRUE))
age_mt <- data_events%>%filter(!is.na(Medal),Season=="Summer")%>%right_join(age_mt,by=c("Sex","Sport","Age"))
age_max
<- filter(age_max,Sport%in%slist)
age_max
<-ggplot(age_max,aes(Sport,Age, color=Sport,fill=Name)) +
c geom_point(position = "dodge", width =.5,stat="identity") +
scale_color_viridis_d() +
coord_flip()+
facet_wrap(~Sex)+
#theme_grey() +
#scale_x_discrete() +
scale_y_continuous(breaks=seq(30,75,by=5)) +
xlab("")+ ylab("Alter")+
theme(legend.position = "none",
axis.text = element_text(size = 8,face="bold"),
plot.title = element_text(size=12,face = "bold")) #+
#ggtitle("Älteste Olympia-Medaillengewinner:innen in ausgewählten Sportarten")
ggplotly(c)
<- data_events%>%filter(Season=='Summer')
age_red <- age_red%>%group_by(Sex,Sport)%>%summarize(Age=quantile(Age,na.rm = TRUE))
age_red
<- data_events%>%filter(Season=="Summer")%>%right_join(age_red,by=c("Sex","Sport","Age"))
age_dist
do.call("rbind",
tapply(age_dist$Age, # Specify numeric column
$Sport, # Specify group variable
age_dist quantile))
## 0% 25% 50% 75% 100%
## Aeronautics 26 26.0000 26.00 26.000 26
## Alpinism 22 29.7500 43.00 43.000 49
## Archery 14 21.0000 25.00 26.000 71
## Art Competitions 14 37.0000 45.00 45.000 97
## Athletics 12 22.0000 25.00 28.000 52
## Badminton 16 24.0000 25.00 27.500 44
## Baseball 16 23.0000 26.00 29.000 44
## Basketball 16 22.0000 25.00 28.000 40
## Basque Pelota 26 26.0000 26.00 26.000 26
## Beach Volleyball 18 27.0000 30.00 31.000 41
## Boxing 15 20.0000 23.00 25.000 41
## Canoeing 15 22.0000 25.00 28.000 48
## Cricket 21 24.0000 24.00 27.500 44
## Croquet 15 30.7500 36.00 46.000 58
## Cycling 14 21.0000 24.00 26.000 49
## Diving 12 20.0000 21.00 24.000 51
## Equestrianism 16 28.0000 34.00 40.000 72
## Fencing 14 24.0000 28.00 29.000 63
## Figure Skating 18 26.0000 26.75 36.000 45
## Football 15 21.0000 23.00 23.000 40
## Golf 15 24.5000 30.00 37.000 50
## Gymnastics 10 18.0000 22.00 24.000 49
## Handball 14 24.0000 26.00 29.000 43
## Hockey 15 23.0000 25.00 28.000 44
## Ice Hockey 18 23.0000 25.00 29.000 46
## Jeu De Paume 19 29.0000 30.00 38.125 43
## Judo 14 23.0000 25.00 28.000 43
## Lacrosse 14 24.0000 26.00 30.000 37
## Modern Pentathlon 15 23.0000 26.00 29.000 53
## Motorboating 26 26.0000 27.50 30.500 54
## Polo 21 31.0000 35.00 39.000 53
## Racquets 19 24.7500 31.00 38.000 45
## Rhythmic Gymnastics 13 17.0000 18.00 18.000 30
## Roque 37 48.0000 59.00 61.500 64
## Rowing 11 22.0000 25.00 25.000 60
## Rugby 18 22.0000 24.00 26.000 39
## Rugby Sevens 18 23.1875 26.00 28.000 36
## Sailing 13 25.0000 29.00 29.000 71
## Shooting 15 28.0000 33.00 34.000 72
## Softball 17 23.0000 26.00 26.000 41
## Swimming 11 19.0000 19.00 22.000 46
## Synchronized Swimming 15 20.0000 22.00 25.000 40
## Table Tennis 15 23.0000 25.00 29.000 54
## Taekwondo 16 22.0000 24.00 26.000 37
## Tennis 13 23.0000 26.00 28.000 47
## Trampolining 18 22.0000 24.00 27.000 39
## Triathlon 18 25.0000 28.00 30.000 42
## Tug-Of-War 17 25.2500 29.00 33.000 45
## Volleyball 15 23.0000 24.00 27.000 41
## Water Polo 14 22.0000 25.00 28.000 45
## Weightlifting 15 23.0000 25.00 27.000 45
## Wrestling 15 23.0000 25.00 28.000 50
<- data_events%>%filter(!is.na(Year),Season=='Summer',Year>1890)%>%left_join(data_team,by=c("NOC"="NOC"))
sport $notes<-NULL
sport<- sport%>%group_by(Sport,region)%>%summarize(total=n())
sport
=c('Archery','Athletics','Basketball','Canoeing','Cycling','Equestrianism','Fencing','Gymnastics','Hockey','Rowing','Sailing','Shooting','Swimming','Table Tennis','Tennis','Weightlifting','Wrestling')
slist
<- filter(sport,Sport%in%slist,total>=1)
Sport
<- filter(age_red,Sport%in%slist)
age_red
<- ggplot(data = age_red) +
c geom_pointrange(
mapping = aes(x = Sport, y = Age, color=Sport),
stat = "summary",
fun.min = min,
fun.max = max,
fun = median
+
)
scale_color_viridis_d() +
coord_flip()+
facet_wrap(~Sex)+
theme_grey() +
#scale_x_discrete() +
scale_y_continuous(breaks=seq(10,100,by=5)) +
xlab("")+ylab("Alter")+
theme(legend.position = "none",
axis.text = element_text(size = 8,face="bold"),
plot.title = element_text(size=12,face = "bold")) #+
#ggtitle("Altersbandbreite aller Olympia-Athlet:innen in ausgewählten Sportarten")
ggplotly(c)
<- data_events%>%filter(!is.na(Age),Season=='Summer')%>%group_by(Sex,Age,Year)%>%summarize(pop=n())
df $Sex <- ifelse(df$Sex=="F","Female","Male")
df
<- df%>%
df mutate(athletes = pop*ifelse(Sex == "Female", -1, 1))
<- df %>%
series group_by(Sex, Age)%>%
do(data = list(sequence = .$athletes)) %>%
ungroup() %>%
group_by(Sex) %>%
do(data = .$data) %>%
mutate(name = Sex)%>%
list_parse()
<- max(abs(df$athletes))
maxpop
<- list(categories = sort(unique(df$Age)),
xaxis reversed = FALSE, tickInterval = 3,
labels = list(step= 3))
<- sort(unique(df$Year))
yrs
highchart() %>%
hc_chart(type = "bar") %>%
hc_motion(enabled = TRUE, labels =yrs, series = c(0,1), autoplay = TRUE, updateInterval = 4) %>%
hc_add_series_list(series) %>%
hc_colors(c("#1dd985", "#1836d9")) %>%
hc_plotOptions(
series = list(stacking = "normal"),
bar = list(groupPadding = 0, pointPadding = 0, borderWidth = 0)
%>%
) hc_tooltip(shared = TRUE) %>%
hc_yAxis(
min=-850,max=850)%>%
hc_xAxis(
xaxis,::list.merge(xaxis, list(opposite = TRUE, linkedTo = 0))
rlist%>%
) hc_tooltip(shared = FALSE,
formatter = JS("function () { return '<b>' + this.series.name + ', Age ' + this.point.category + '</b><br/>' + 'athletes: ' + Highcharts.numberFormat(Math.abs(this.point.y), 0);}")
%>%
) #hc_title(text = "Athlet:innen der Olympischen Sommerspiele von 1896 bis 2016 nach Geschlecht und Alter")%>%
#hc_subtitle(text = "Olympische Sommerspiele 1896 to 2016")
hc_add_theme(hc_theme_ggplot2())
<- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Swimming")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_sps <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Athletics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spa <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Gymnastics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spg <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Hockey")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spf <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Tennis")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spt <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Basketball")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spc
= data_events%>%filter(Year>1987,Season=='Summer')%>%group_by(Year)%>%summarize(Y=unique(Year))
age_y
<- age_y%>%left_join(age_sps, by=c("Y"="Year"))
age_sps <- age_y%>%left_join(age_spa, by=c("Y"="Year"))
age_spa <- age_y%>%left_join(age_spg, by=c("Y"="Year"))
age_spg <- age_y%>%left_join(age_spf, by=c("Y"="Year"))
age_spf <- age_y%>%left_join(age_spt, by=c("Y"="Year"))
age_spt <- age_y%>%left_join(age_spc, by=c("Y"="Year"))
age_spc
highchart(height = "700px") %>%
#hc_title(text = "Durchschnittsalter der männlichen Athlet:innen bei den Olympischen Sommerspielen ab 1988 in ausgewählten Sportarten") %>%
#hc_subtitle(text = "Olympische Sommerspiele ab 1988") %>%
#hc_credits(enabled = TRUE, text = "120 years of Olympic history: athletes and results", style = list(fontSize = "10px")) %>%
#hc_add_theme(hc_theme_flat()) %>%
hc_xAxis(categories = age_y$Y,title = list(text = "Jahr")) %>%
hc_add_series(name = "Swimming", color = "grey", data = age_sps$avg)%>%
hc_add_series(name = "Athletics", color = "#f9b4cb", data = age_spa$avg) %>%
hc_add_series(name = "Gymnastics", color = "lightgrey", data = age_spg$avg)%>%
hc_add_series(name = "Hockey", color = "#fa4f1e", data = age_spf$avg)%>%
hc_add_series(name = "Tennis",color = "#1836d9", data = age_spt$avg)%>%
hc_add_series(name = "Basketball", color = "#1dd985", data = age_spc$avg)%>%
hc_yAxis(title = list(text = "Altersdurchschnitt"),
labels = list(format = "{value}"), max = 30) %>%
hc_legend(enabled = T, layout = "vertical", align= "right", verticalAlign = "middle") %>%
hc_add_theme(hc_theme_ggplot2())
<- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Swimming")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_sps <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Athletics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spa <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Gymnastics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spg <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Hockey")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spf <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Tennis")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spt <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Basketball")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spc
= data_events%>%filter(Year>1987,Season=='Summer')%>%group_by(Year)%>%summarize(Y=unique(Year))
age_y
<- age_y%>%left_join(age_sps, by=c("Y"="Year"))
age_sps <- age_y%>%left_join(age_spa, by=c("Y"="Year"))
age_spa <- age_y%>%left_join(age_spg, by=c("Y"="Year"))
age_spg <- age_y%>%left_join(age_spf, by=c("Y"="Year"))
age_spf <- age_y%>%left_join(age_spt, by=c("Y"="Year"))
age_spt <- age_y%>%left_join(age_spc, by=c("Y"="Year"))
age_spc
highchart(height = "700px") %>%
#hc_title(text = "Durchschnittsalter der weiblichen Athlet:innen bei den Olympischen Sommerspielen ab 1988 in ausgewählten Sportarten") %>%
#hc_subtitle(text = "Olympische Sommerspiele ab 1988") %>%
#hc_credits(enabled = TRUE, text = "120 years of Olympic history: athletes and results", style = list(fontSize = "10px")) %>%
hc_xAxis(categories = age_y$Y,title = list(text = "Jahr")) %>%
hc_add_series(name = "Swimming", color = "grey", data = age_sps$avg)%>%
hc_add_series(name = "Athletics", color = "#f9b4cb", data = age_spa$avg) %>%
hc_add_series(name = "Gymnastics", color = "lightgrey", data = age_spg$avg)%>%
hc_add_series(name = "Hockey", color = "#fa4f1e", data = age_spf$avg)%>%
hc_add_series(name = "Tennis",color = "#1836d9", data = age_spt$avg)%>%
hc_add_series(name = "Basketball", color = "#1dd985", data = age_spc$avg)%>%
hc_yAxis(title = list(text = "Altersdurchschnitt"),
labels = list(format = "{value}"), max = 30) %>%
hc_legend(enabled = T, layout = "vertical", align= "right", verticalAlign = "middle")%>%
hc_add_theme(hc_theme_ggplot2())
Weitere Analysen, Ideen und Konzepte rund um Agediversity und Altersgemischte Teams erhaltet Ihr unter hellogoka.com oder kontaktiert uns. Vertiefte Analysen und Predictive Analytics unter Anwendung von Algorithmen für Machine Learning entwickeln wir für Euch und Euer Unternehmen zusammen mit unseren Kolleg:innen von Neopera.
[1] Rgriffin. “120 Years of Olympic History: Athletes and Results.” Kaggle, 15 June 2018.