Einleitung

Die folgende explorative Analyse rund um die Olympischen Sommerspiele basiert auf einem Datensatz, welcher auf der Plattform Kaggle - einer Online-Community von und für Datenwissenschaftler:innen - von User “rgriffin” unter dem Titel “120 years of Olympic History: Athletes and Results.”[1] zur Verfügung gestellt wurde. Der Hauptzweck von Kaggle ist die Organisation von Data-Science-Wettbewerben. Der bereitgestellte Datensatz umfasst alle Olympischen Sommer- und Winterspiele im Zeitraum 1896 bis 2016 und besteht aus 271.116 Sportler:innen mit 15 Attributen, d.h. ca. 4 Millionen Datenpunkten.

Das GOKA-Team hat hierbei den Fokus auf die Agediversity Dimension gelegt. Probiere die interaktiven Grafiken aus, um weitere Details zu erkunden.

Jüngste Olympia-Medaillengewinner:innen in ausgewählten Sportarten

sport <- data_events%>%filter(!is.na(Medal),!is.na(Year),Season=='Summer',Year>1890)%>%left_join(data_team,by=c("NOC"="NOC"))
sport$notes<-NULL
sport <- sport%>%group_by(Sport,region)%>%summarize(total=n())

slist =c('Archery','Athletics','Basketball','Canoeing','Cycling','Equestrianism','Fencing','Gymnastics','Hockey','Rowing','Sailing','Shooting','Swimming','Table Tennis','Tennis','Weightlifting','Wrestling')
sport <- filter(sport,Sport%in%slist,total>=1)

age_mi <- data_events%>%filter(!is.na(Medal),!is.na(Year),Season=='Summer',Year>1890)
age_mi <- age_mi%>%group_by(Sex,Sport)%>%summarize(Age=min(Age,na.rm = TRUE))
age_min <- data_events%>%filter(!is.na(Medal),Season=="Summer")%>%right_join(age_mi,by=c("Sex","Sport","Age"))

age_min <- filter(age_min,Sport%in%slist)

c <-ggplot(age_min,aes(Sport,Age, color=Sport,fill=Name)) +
  geom_point(position = "dodge",  width =.5,stat="identity") +
  scale_color_viridis_d() +
  coord_flip()+
  facet_wrap(~Sex)+
  theme_grey() + 
  
  #scale_x_discrete() +
  scale_y_continuous(breaks=seq(10,22,by=2)) +
  
  xlab("")+ylab("Alter")+
  theme(legend.position = "none",
  axis.text = element_text(size = 8,face="bold"),
  plot.title = element_text(size=12,face = "bold")) #+ 
  #ggtitle("Jüngste Olympia-Medaillengewinner:innen in ausgewählten Sportarten")

ggplotly(c)

Älteste Olympia-Medaillengewinner:innen in ausgewählten Sportarten

sport <- data_events%>%filter(!is.na(Medal),!is.na(Year),Season=='Summer',Year>1890)%>%left_join(data_team,by=c("NOC"="NOC"))
sport$notes<-NULL
sport <- sport%>%group_by(Sport,region)%>%summarize(total=n())

slist =c('Archery','Athletics','Basketball','Canoeing','Cycling','Equestrianism','Fencing','Gymnastics','Hockey','Rowing','Sailing','Shooting','Swimming','Table Tennis','Tennis','Weightlifting','Wrestling')
sport <- filter(sport,Sport%in%slist,total>=1)

age_mt <- data_events%>%filter(!is.na(Medal),Season=='Summer')
age_mt <-age_mt%>%group_by(Sex,Sport)%>%summarize(Age=max(Age,na.rm = TRUE))
age_max <- data_events%>%filter(!is.na(Medal),Season=="Summer")%>%right_join(age_mt,by=c("Sex","Sport","Age"))

age_max <- filter(age_max,Sport%in%slist)


c <-ggplot(age_max,aes(Sport,Age, color=Sport,fill=Name)) +
  geom_point(position = "dodge",  width =.5,stat="identity") +
  scale_color_viridis_d() +
  coord_flip()+
  facet_wrap(~Sex)+
  #theme_grey() + 
  #scale_x_discrete() +
  scale_y_continuous(breaks=seq(30,75,by=5)) +
  xlab("")+ ylab("Alter")+
  theme(legend.position = "none",
  axis.text = element_text(size = 8,face="bold"),
  plot.title = element_text(size=12,face = "bold")) #+ 
  #ggtitle("Älteste Olympia-Medaillengewinner:innen in ausgewählten Sportarten") 

ggplotly(c)

Altersbandbreite aller Olympia-Athlet:innen in ausgewählten Sportarten

age_red <- data_events%>%filter(Season=='Summer')
age_red <- age_red%>%group_by(Sex,Sport)%>%summarize(Age=quantile(Age,na.rm = TRUE))

age_dist <- data_events%>%filter(Season=="Summer")%>%right_join(age_red,by=c("Sex","Sport","Age"))

do.call("rbind",
        tapply(age_dist$Age,     # Specify numeric column
               age_dist$Sport,   # Specify group variable
               quantile))
##                       0%     25%   50%    75% 100%
## Aeronautics           26 26.0000 26.00 26.000   26
## Alpinism              22 29.7500 43.00 43.000   49
## Archery               14 21.0000 25.00 26.000   71
## Art Competitions      14 37.0000 45.00 45.000   97
## Athletics             12 22.0000 25.00 28.000   52
## Badminton             16 24.0000 25.00 27.500   44
## Baseball              16 23.0000 26.00 29.000   44
## Basketball            16 22.0000 25.00 28.000   40
## Basque Pelota         26 26.0000 26.00 26.000   26
## Beach Volleyball      18 27.0000 30.00 31.000   41
## Boxing                15 20.0000 23.00 25.000   41
## Canoeing              15 22.0000 25.00 28.000   48
## Cricket               21 24.0000 24.00 27.500   44
## Croquet               15 30.7500 36.00 46.000   58
## Cycling               14 21.0000 24.00 26.000   49
## Diving                12 20.0000 21.00 24.000   51
## Equestrianism         16 28.0000 34.00 40.000   72
## Fencing               14 24.0000 28.00 29.000   63
## Figure Skating        18 26.0000 26.75 36.000   45
## Football              15 21.0000 23.00 23.000   40
## Golf                  15 24.5000 30.00 37.000   50
## Gymnastics            10 18.0000 22.00 24.000   49
## Handball              14 24.0000 26.00 29.000   43
## Hockey                15 23.0000 25.00 28.000   44
## Ice Hockey            18 23.0000 25.00 29.000   46
## Jeu De Paume          19 29.0000 30.00 38.125   43
## Judo                  14 23.0000 25.00 28.000   43
## Lacrosse              14 24.0000 26.00 30.000   37
## Modern Pentathlon     15 23.0000 26.00 29.000   53
## Motorboating          26 26.0000 27.50 30.500   54
## Polo                  21 31.0000 35.00 39.000   53
## Racquets              19 24.7500 31.00 38.000   45
## Rhythmic Gymnastics   13 17.0000 18.00 18.000   30
## Roque                 37 48.0000 59.00 61.500   64
## Rowing                11 22.0000 25.00 25.000   60
## Rugby                 18 22.0000 24.00 26.000   39
## Rugby Sevens          18 23.1875 26.00 28.000   36
## Sailing               13 25.0000 29.00 29.000   71
## Shooting              15 28.0000 33.00 34.000   72
## Softball              17 23.0000 26.00 26.000   41
## Swimming              11 19.0000 19.00 22.000   46
## Synchronized Swimming 15 20.0000 22.00 25.000   40
## Table Tennis          15 23.0000 25.00 29.000   54
## Taekwondo             16 22.0000 24.00 26.000   37
## Tennis                13 23.0000 26.00 28.000   47
## Trampolining          18 22.0000 24.00 27.000   39
## Triathlon             18 25.0000 28.00 30.000   42
## Tug-Of-War            17 25.2500 29.00 33.000   45
## Volleyball            15 23.0000 24.00 27.000   41
## Water Polo            14 22.0000 25.00 28.000   45
## Weightlifting         15 23.0000 25.00 27.000   45
## Wrestling             15 23.0000 25.00 28.000   50
sport <- data_events%>%filter(!is.na(Year),Season=='Summer',Year>1890)%>%left_join(data_team,by=c("NOC"="NOC"))
sport$notes<-NULL
sport <- sport%>%group_by(Sport,region)%>%summarize(total=n())

slist =c('Archery','Athletics','Basketball','Canoeing','Cycling','Equestrianism','Fencing','Gymnastics','Hockey','Rowing','Sailing','Shooting','Swimming','Table Tennis','Tennis','Weightlifting','Wrestling')

Sport <- filter(sport,Sport%in%slist,total>=1)

age_red <- filter(age_red,Sport%in%slist)

c <- ggplot(data = age_red) +
  geom_pointrange(
    mapping = aes(x = Sport, y = Age, color=Sport),
    stat = "summary",
    fun.min = min,
    fun.max = max,
    fun = median
  ) +

scale_color_viridis_d() +
  coord_flip()+  
  facet_wrap(~Sex)+
  
  theme_grey() + 
  #scale_x_discrete() +
  scale_y_continuous(breaks=seq(10,100,by=5)) +
  
  xlab("")+ylab("Alter")+
  theme(legend.position = "none",
  axis.text = element_text(size = 8,face="bold"),
  plot.title = element_text(size=12,face = "bold")) #+ 
  #ggtitle("Altersbandbreite aller Olympia-Athlet:innen in ausgewählten Sportarten")

ggplotly(c)

Dynamische Altersstruktur der Olympia-Athlet:innen (1896-2016)

df <- data_events%>%filter(!is.na(Age),Season=='Summer')%>%group_by(Sex,Age,Year)%>%summarize(pop=n())
df$Sex <- ifelse(df$Sex=="F","Female","Male")

df <- df%>%
  mutate(athletes = pop*ifelse(Sex == "Female", -1, 1))

series <- df %>% 
  group_by(Sex, Age)%>%
  do(data = list(sequence = .$athletes)) %>% 
  ungroup() %>% 
  group_by(Sex) %>% 
  do(data = .$data) %>%
  mutate(name = Sex)%>%
 list_parse()

maxpop <- max(abs(df$athletes))

xaxis <- list(categories = sort(unique(df$Age)),
              reversed = FALSE, tickInterval = 3,
              labels = list(step= 3))

yrs <-  sort(unique(df$Year))

highchart() %>%
  hc_chart(type = "bar") %>%
  hc_motion(enabled = TRUE, labels =yrs, series = c(0,1), autoplay = TRUE, updateInterval = 4) %>% 
  hc_add_series_list(series) %>% 
  hc_colors(c("#1dd985", "#1836d9")) %>% 
  hc_plotOptions(
    series = list(stacking = "normal"),
    bar = list(groupPadding = 0, pointPadding =  0, borderWidth = 0)
  ) %>% 
  hc_tooltip(shared = TRUE) %>% 
  hc_yAxis(
    min=-850,max=850)%>% 
    hc_xAxis(
    xaxis,
    rlist::list.merge(xaxis, list(opposite = TRUE, linkedTo = 0))
  ) %>% 
  hc_tooltip(shared = FALSE,
             formatter = JS("function () { return '<b>' + this.series.name + ', Age ' + this.point.category + '</b><br/>' + 'athletes: ' + Highcharts.numberFormat(Math.abs(this.point.y), 0);}")
             
  ) %>%
  #hc_title(text = "Athlet:innen der Olympischen Sommerspiele von 1896 bis 2016 nach Geschlecht und Alter")%>%
  
#hc_subtitle(text = "Olympische Sommerspiele 1896 to 2016")
hc_add_theme(hc_theme_ggplot2())

Durchschnittsalter der Athlet:innen bei den Olympischen Sommerspielen ab 1988 in ausgewählten Sportarten

age_sps <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Swimming")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spa <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Athletics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spg <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Gymnastics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spf <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Hockey")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spt <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Tennis")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spc <- data_events%>%filter(Year>1987,Sex=="M",Season=='Summer',Sport=="Basketball")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))

age_y = data_events%>%filter(Year>1987,Season=='Summer')%>%group_by(Year)%>%summarize(Y=unique(Year))

age_sps <- age_y%>%left_join(age_sps, by=c("Y"="Year"))
age_spa <- age_y%>%left_join(age_spa, by=c("Y"="Year"))
age_spg <- age_y%>%left_join(age_spg, by=c("Y"="Year"))
age_spf <- age_y%>%left_join(age_spf, by=c("Y"="Year"))
age_spt <- age_y%>%left_join(age_spt, by=c("Y"="Year"))
age_spc <- age_y%>%left_join(age_spc, by=c("Y"="Year"))


highchart(height = "700px") %>% 
  #hc_title(text = "Durchschnittsalter der männlichen Athlet:innen bei den Olympischen Sommerspielen ab 1988 in ausgewählten Sportarten") %>%
  #hc_subtitle(text = "Olympische Sommerspiele ab 1988") %>%
  #hc_credits(enabled = TRUE, text = "120 years of Olympic history: athletes and results", style = list(fontSize = "10px")) %>%
  
  #hc_add_theme(hc_theme_flat()) %>%
  
  hc_xAxis(categories = age_y$Y,title = list(text = "Jahr")) %>% 
  
  hc_add_series(name = "Swimming", color = "grey", data = age_sps$avg)%>% 
  hc_add_series(name = "Athletics", color = "#f9b4cb", data = age_spa$avg) %>% 
  hc_add_series(name = "Gymnastics", color = "lightgrey", data = age_spg$avg)%>%
  hc_add_series(name = "Hockey", color = "#fa4f1e", data = age_spf$avg)%>%
  hc_add_series(name = "Tennis",color = "#1836d9", data = age_spt$avg)%>%
  hc_add_series(name = "Basketball", color = "#1dd985", data = age_spc$avg)%>%
  
  hc_yAxis(title = list(text = "Altersdurchschnitt"),
           labels = list(format = "{value}"), max = 30) %>%
  hc_legend(enabled = T,  layout = "vertical", align= "right", verticalAlign = "middle") %>%
  hc_add_theme(hc_theme_ggplot2())
age_sps <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Swimming")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spa <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Athletics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spg <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Gymnastics")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spf <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Hockey")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spt <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Tennis")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))
age_spc <- data_events%>%filter(Year>1987,Sex=="F",Season=='Summer',Sport=="Basketball")%>%group_by(Year)%>%summarize(avg=round(mean(Age,na.rm = TRUE),1))

age_y = data_events%>%filter(Year>1987,Season=='Summer')%>%group_by(Year)%>%summarize(Y=unique(Year))

age_sps <- age_y%>%left_join(age_sps, by=c("Y"="Year"))
age_spa <- age_y%>%left_join(age_spa, by=c("Y"="Year"))
age_spg <- age_y%>%left_join(age_spg, by=c("Y"="Year"))
age_spf <- age_y%>%left_join(age_spf, by=c("Y"="Year"))
age_spt <- age_y%>%left_join(age_spt, by=c("Y"="Year"))
age_spc <- age_y%>%left_join(age_spc, by=c("Y"="Year"))


highchart(height = "700px") %>% 
#hc_title(text = "Durchschnittsalter der weiblichen Athlet:innen bei den Olympischen Sommerspielen ab 1988 in ausgewählten Sportarten") %>%
#hc_subtitle(text = "Olympische Sommerspiele ab 1988") %>%
#hc_credits(enabled = TRUE, text = "120 years of Olympic history: athletes and results", style = list(fontSize = "10px")) %>%
  
hc_xAxis(categories = age_y$Y,title = list(text = "Jahr")) %>% 

hc_add_series(name = "Swimming", color = "grey", data = age_sps$avg)%>% 
hc_add_series(name = "Athletics", color = "#f9b4cb", data = age_spa$avg) %>% 
hc_add_series(name = "Gymnastics", color = "lightgrey", data = age_spg$avg)%>%
hc_add_series(name = "Hockey", color = "#fa4f1e", data = age_spf$avg)%>%
hc_add_series(name = "Tennis",color = "#1836d9", data = age_spt$avg)%>%
hc_add_series(name = "Basketball", color = "#1dd985", data = age_spc$avg)%>%
  
hc_yAxis(title = list(text = "Altersdurchschnitt"),
           labels = list(format = "{value}"), max = 30) %>%
hc_legend(enabled = T,  layout = "vertical", align= "right", verticalAlign = "middle")%>%
hc_add_theme(hc_theme_ggplot2())

Zusammenfassung

Weitere Analysen, Ideen und Konzepte rund um Agediversity und Altersgemischte Teams erhaltet Ihr unter hellogoka.com oder kontaktiert uns. Vertiefte Analysen und Predictive Analytics unter Anwendung von Algorithmen für Machine Learning entwickeln wir für Euch und Euer Unternehmen zusammen mit unseren Kolleg:innen von Neopera.

Literatur

[1] Rgriffin. “120 Years of Olympic History: Athletes and Results.” Kaggle, 15 June 2018.