Cleaning

Set Up

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Olympics Data

Data Import

df <- read_csv("data/olympics_dataset.csv")
Rows: 252565 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Name, Sex, Team, NOC, Season, City, Sport, Event, Medal
dbl (2): player_id, Year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Inspect

colnames(df)
 [1] "player_id" "Name"      "Sex"       "Team"      "NOC"       "Year"     
 [7] "Season"    "City"      "Sport"     "Event"     "Medal"    

Data Cleaning

Rename Data

df_clean <- df |>
  rename(country = `Team`,
         name = `Name`,
         sex = `Sex`,
         year = `Year`,
         medal = `Medal`,
         city = `City`,
         sport = `Sport`,
         event = `Event`) |>
  select(name, sex, country, year, medal, city, sport, event)
glimpse(df_clean)
Rows: 252,565
Columns: 8
$ name    <chr> "A Dijiang", "A Lamusi", "Gunnar Aaby", "Edgar Aabye", "Cornel…
$ sex     <chr> "M", "M", "M", "M", "F", "F", "M", "M", "F", "F", "M", "M", "M…
$ country <chr> "China", "China", "Denmark", "Denmark/Sweden", "Netherlands", …
$ year    <dbl> 1992, 2012, 1920, 1900, 1932, 1932, 1952, 2000, 1996, 2000, 19…
$ medal   <chr> "No medal", "No medal", "No medal", "Gold", "No medal", "No me…
$ city    <chr> "Barcelona", "London", "Antwerpen", "Paris", "Los Angeles", "L…
$ sport   <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Athletics", "…
$ event   <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightweight",…

Remove Incorrect Data

# Note: There was no summer Olympics in 1916. 
# Also, the 1956 Olympics was held in Melbourne instead of Stockholm.
df_clean2 <- df_clean |>
  filter(year != 1916,
         !(year == 1956 & city == "Stockholm"))
glimpse(df_clean2)
Rows: 252,267
Columns: 8
$ name    <chr> "A Dijiang", "A Lamusi", "Gunnar Aaby", "Edgar Aabye", "Cornel…
$ sex     <chr> "M", "M", "M", "M", "F", "F", "M", "M", "F", "F", "M", "M", "M…
$ country <chr> "China", "China", "Denmark", "Denmark/Sweden", "Netherlands", …
$ year    <dbl> 1992, 2012, 1920, 1900, 1932, 1932, 1952, 2000, 1996, 2000, 19…
$ medal   <chr> "No medal", "No medal", "No medal", "Gold", "No medal", "No me…
$ city    <chr> "Barcelona", "London", "Antwerpen", "Paris", "Los Angeles", "L…
$ sport   <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Athletics", "…
$ event   <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightweight",…

Data Save

save(df_clean2,file = "data/df_clean.RData")

GDP Data

Data Import

gdp <- read.csv("data/gdp.csv")

Data Inspect

glimpse(gdp)
Rows: 218
Columns: 66
$ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "Andorra", "…
$ Country.Code <chr> "ABW", "AFG", "AGO", "ALB", "AND", "ARE", "ARG", "ARM", "…
$ X1960        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 18607682977, 6624…
$ X1961        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 19684003149, 7346…
$ X1962        <dbl> NA, NA, NA, NA, NA, NA, 24450604878, NA, NA, NA, 19923683…
$ X1963        <dbl> NA, NA, NA, NA, NA, NA, 18272123664, NA, NA, NA, 21540963…
$ X1964        <dbl> NA, NA, NA, NA, NA, NA, 25605249382, NA, NA, NA, 23801123…
$ X1965        <dbl> NA, NA, NA, NA, NA, NA, 28344705967, NA, NA, NA, 25977284…
$ X1966        <dbl> NA, NA, NA, NA, NA, NA, 28630474728, NA, NA, NA, 27308964…
$ X1967        <dbl> NA, NA, NA, NA, NA, NA, 24256667553, NA, NA, NA, 30442724…
$ X1968        <dbl> NA, NA, NA, NA, NA, NA, 26436857247, NA, NA, NA, 32714085…
$ X1969        <dbl> NA, NA, NA, NA, NA, NA, 31256284544, NA, NA, NA, 36684485…
$ X1970        <dbl> NA, NA, NA, NA, 78617711, 685986701, 31584210366, NA, NA,…
$ X1971        <dbl> NA, NA, NA, NA, 89406608, 939893600, 33293199095, NA, NA,…
$ X1972        <dbl> NA, NA, NA, NA, 113414397, 1415086929, 34733000536, NA, N…
$ X1973        <dbl> NA, NA, NA, NA, 150841639, 4231243616, 52544000117, NA, N…
$ X1974        <dbl> NA, NA, NA, NA, 186557082, 11651505689, 72436777342, NA, …
$ X1975        <dbl> NA, NA, NA, NA, 220112572, 14720728249, 52438647922, NA, …
$ X1976        <dbl> NA, NA, NA, NA, 227283851, 19213158779, 51169499892, NA, …
$ X1977        <dbl> NA, NA, NA, NA, 253997897, 24871775165, 56781000101, NA, …
$ X1978        <dbl> NA, NA, NA, NA, 308020261, 23775764225, 89049453088, NA, …
$ X1979        <dbl> NA, NA, NA, NA, 411548748, 31225659621, 69252328952, NA, …
$ X1980        <dbl> NA, NA, 5930503401, 1578102105, 446377777, 43599160050, 7…
$ X1981        <dbl> NA, NA, 5550483036, 1808177156, 388983266, 49333424135, 7…
$ X1982        <dbl> NA, NA, 5550483036, 1861163170, 375914744, 46622718605, 8…
$ X1983        <dbl> NA, NA, 5784341596, 1881412587, 327850043, 42803323345, 1…
$ X1984        <dbl> NA, NA, 6131475065, 1857337995, 330073081, 41807954236, 1…
$ X1985        <dbl> NA, NA, 7554065410, 1897050117, 346742715, 40603650232, 8…
$ X1986        <dbl> 405586592, NA, 7072536109, 2097326250, 481996013, 3394361…
$ X1987        <dbl> 487709497, NA, 8084412414, 2080796250, 611299925, 3638490…
$ X1988        <dbl> 596648045, NA, 8769836769, 2051236250, 721425939, 3627567…
$ X1989        <dbl> 695530726, NA, 10201780977, 2253090000, 795489582, 414649…
$ X1990        <dbl> 764804469, NA, 11229515599, 2028553750, 1028989692, 50701…
$ X1991        <dbl> 872067039, NA, 10603784541, 1099559028, 1106891362, 51552…
$ X1992        <dbl> 958659218, NA, 8307810974, 652174991, 1209992020, 5423917…
$ X1993        <dbl> 1083240223, NA, 5768720422, 1185315468, 1007090270, 55625…
$ X1994        <dbl> 1245810056, NA, 4438321017, 1880950864, 1017544069, 59305…
$ X1995        <dbl> 1320670391, NA, 5538749260, 2392764853, 1178745283, 65743…
$ X1996        <dbl> 1379888268, NA, 7526446606, 3199640815, 1224024139, 73571…
$ X1997        <dbl> 1531843575, NA, 7648377413, 2258513974, 1180645572, 78839…
$ X1998        <dbl> 1665363128, NA, 6506229607, 2545964541, 1211953994, 75674…
$ X1999        <dbl> 1722905028, NA, 6152923310, 3212121651, 1240295199, 84445…
$ X2000        <dbl> 1873452514, 3521418060, 9129594970, 3480355258, 143260618…
$ X2001        <dbl> 1896456983, 2813571754, 8936079118, 3922100794, 154826580…
$ X2002        <dbl> 1961843575, 3825701439, 15285592370, 4348068242, 17642798…
$ X2003        <dbl> 2044111732, 4520946819, 17812704586, 5611496257, 23669418…
$ X2004        <dbl> 2254830726, 5224896719, 23552057679, 7184685782, 29002453…
$ X2005        <dbl> 2360017318, 6203256539, 36970900884, 8052077248, 31610840…
$ X2006        <dbl> 2469782682, 6971758282, 52381025141, 8896075005, 34593379…
$ X2007        <dbl> 2677641341, 9747886187, 65266415494, 10677324853, 3957625…
$ X2008        <dbl> 2.843025e+09, 1.010930e+10, 8.853867e+10, 1.288135e+10, 4…
$ X2009        <dbl> 2553793296, 12416152732, 70307196182, 12044205550, 368897…
$ X2010        <dbl> 2.453597e+09, 1.585667e+10, 8.379947e+10, 1.192693e+10, 3…
$ X2011        <dbl> 2.637859e+09, 1.780510e+10, 1.117897e+11, 1.289076e+10, 3…
$ X2012        <dbl> 2.615208e+09, 1.990733e+10, 1.280529e+11, 1.231983e+10, 3…
$ X2013        <dbl> 2.727850e+09, 2.014642e+10, 1.323391e+11, 1.277622e+10, 3…
$ X2014        <dbl> 2.790850e+09, 2.049713e+10, 1.359668e+11, 1.322815e+10, 3…
$ X2015        <dbl> 2.962907e+09, 1.913422e+10, 9.049642e+10, 1.138685e+10, 2…
$ X2016        <dbl> 2.983635e+09, 1.811657e+10, 5.276162e+10, 1.186120e+10, 2…
$ X2017        <dbl> 3.092429e+09, 1.875346e+10, 7.369015e+10, 1.301973e+10, 3…
$ X2018        <dbl> 3.276184e+09, 1.805322e+10, 7.945069e+10, 1.537951e+10, 3…
$ X2019        <dbl> 3.395799e+09, 1.879944e+10, 7.089796e+10, 1.558511e+10, 3…
$ X2020        <dbl> 2.481857e+09, 1.995593e+10, 4.850156e+10, 1.524146e+10, 2…
$ X2021        <dbl> 2.929447e+09, 1.426000e+10, 6.650513e+10, 1.803201e+10, 3…
$ X2022        <dbl> 3.279344e+09, 1.449724e+10, 1.043997e+11, 1.901724e+10, 3…
$ X2023        <dbl> 3.648573e+09, 1.723305e+10, 8.482465e+10, 2.354718e+10, 3…

Data Cleaning

Transform wide data to long data format

gdp_long <- gdp |> 
  rename(country = `Country.Name`) |>
  pivot_longer(cols = 3:66, 
               names_to = "year", 
               values_to = "gdp") |> 
  mutate(year = substr(year,2,5)) |>
  mutate(year = as.numeric(year)) |>
  select(country, year, gdp)
glimpse(gdp_long)
Rows: 13,952
Columns: 3
$ country <chr> "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba",…
$ year    <dbl> 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 19…
$ gdp     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Data Save

save(gdp_long,file = "data/gdp_long.RData")