Data Structures and Data Frames

Preflight

  1. Ensure that RStudio/R is open with the appropriate packages installed (tidyverse)

Learning Questions


Learning Objectives


Data Types and Structures in R

cats <- data.frame(coat = c("calico", "black", "tabby"),
                    weight = c(2.1, 5.0, 3.2),
                    likes_catnip = c(1, 0, 1))
write.csv(cats, file = "data/feline-data.csv", row.names = FALSE)
cats <- read.csv(file = "data/feline-data.csv")
cats
    coat weight likes_catnip
1 calico    2.1            1
2  black    5.0            0
3  tabby    3.2            1
> cats$weight
[1] 2.1 5.0 3.2
> cats$coat
[1] "calico" "black"  "tabby" 
> cats$weight + 2
[1] 4.1 7.0 5.2
> paste("My cat is", cats$coat)
[1] "My cat is calico" "My cat is black"  "My cat is tabby" 
> cats$weight + cats$coat
Error in cats$weight + cats$coat : 
  non-numeric argument to binary operator

What Data Types Do You Expect?


Data Types in R

> typeof(cats$weight)
[1] "double"
> typeof(TRUE)
[1] "logical"
> typeof(3.14)
[1] "double"
> typeof(1)
[1] "double"
> typeof(1L)
[1] "integer"
> typeof(cats$coat)
[1] "character"
> typeof("Irn Bru")
[1] "character"
> additional_cat <- data.frame(coat = "tabby", weight = "2.3 or 2.4", likes_catnip = 1)
> additional_cat
   coat     weight likes_catnip
1 tabby 2.3 or 2.4   
> cats2 <- rbind(cats, additional_cat)
> cats2
    coat     weight likes_catnip
1 calico        2.1            1
2  black          5            0
3  tabby        3.2            1
4  tabby 2.3 or 2.4            1
> typeof(cats$weight)
[1] "double"
> typeof(cats2$weight)
[1] "character"
> cats2$weight + 2
Error in cats2$weight + 2 : non-numeric argument to binary operator

A Quick Note About Dataframes

> str(cats)
'data.frame': 3 obs. of  3 variables:
 $ coat        : chr  "calico" "black" "tabby"
 $ weight      : num  2.1 5 3.2
 $ likes_catnip: int  1 0 1
> str(cats2)
'data.frame': 4 obs. of  3 variables:
 $ coat        : chr  "calico" "black" "tabby" "tabby"
 $ weight      : chr  "2.1" "5" "3.2" "2.3 or 2.4"
 $ likes_catnip: num  1 0 1 1

Vectors

> x <- c(10, 12, 45, 33)
> x
[1] 10 12 45 33
> typeof(x)
[1] "double"
> length(x)
[1] 4
> str(x)
 num [1:4] 10 12 45 33
> str(cats$coat)
 chr [1:3] "calico" "black" "tabby"

Coercion (1)

> quiz_vector <- c(2,6,'3')
> typeof(quiz_vector)
[1] "character"
> coercion_vector <- c('a', TRUE)
> coercion_vector
[1] "a"    "TRUE"
> another_coercion_vector <- c(0, TRUE)
> another_coercion_vector
[1] 0 1
> typeof(coercion_vector)
[1] "character"
> typeof(another_coercion_vector)
[1] "double"

Coercion (2)

> x
[1] 10 12 45 33
> as.character(x)
[1] "10" "12" "45" "33"
> as.complex(x)
[1] 10+0i 12+0i 45+0i 33+0i
> as.logical(x)
[1] TRUE TRUE TRUE TRUE
> cats
    coat weight likes_catnip
1 calico    2.1            1
2  black    5.0            0
3  tabby    3.2            1
> typeof(cats$likes_catnip)
[1] "integer"
> cats$likes_catnip <- as.logical(cats$likes_catnip)
> cats
    coat weight likes_catnip
1 calico    2.1         TRUE
2  black    5.0        FALSE
3  tabby    3.2         TRUE
> str(cats)
'data.frame': 3 obs. of  3 variables:
 $ coat        : chr  "calico" "black" "tabby"
 $ weight      : num  2.1 5 3.2
 $ likes_catnip: logi  TRUE FALSE TRUE

Lists

> l <- list(1, 'a', TRUE, seq(2, 5))
> length(l)
[1] 4
> l
[[1]]
[1] 1

[[2]]
[1] "a"

[[3]]
[1] TRUE

[[4]]
[1] 2 3 4 5
> l[[1]]
[1] 1
> l[[4]]
[1] 2 3 4 5
> str(l)
List of 4
 $ : num 1
 $ : chr "a"
 $ : logi TRUE
 $ : int [1:4] 2 3 4 5
> l_named <- list(a = "SWC", b = 1:4)
> l_named
$a
[1] "SWC"

$b
[1] 1 2 3 4
> l_named$a
[1] "SWC"
> l_named$b
[1] 1 2 3 4
> l_named[[1]]
[1] "SWC"
> l_named[[2]]
[1] 1 2 3 4
> str(l_named)
List of 2
 $ a: chr "SWC"
 $ b: int [1:4] 1 2 3 4

Let’s Look At A Data Frame

> cats
    coat weight likes_catnip
1 calico    2.1         TRUE
2  black    5.0        FALSE
3  tabby    3.2         TRUE
> typeof(cats)
[1] "list"
> cats[[2]]
[1] 2.1 5.0 3.2
> typeof(cats$weight)
[1] "double"
> class(cats)
[1] "data.frame"
> class(l)
[1] "list"

Load Episode Data

# Load gapminder data from a local file
gapminder <- read.table("data/gapminder_data.csv", sep=",", header=TRUE)

Investigating gapminder

> str(gapminder)
'data.frame': 1704 obs. of  6 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
 $ gdpPercap: num  779 821 853 836 740 ...
> summary(gapminder)
        country          year           pop               continent      lifeExp
 Afghanistan:  12   Min.   :1952   Min.   :6.001e+04   Africa  :624   Min.   :23.60
 Albania    :  12   1st Qu.:1966   1st Qu.:2.794e+06   Americas:300   1st Qu.:48.20
 Algeria    :  12   Median :1980   Median :7.024e+06   Asia    :396   Median :60.71
 Angola     :  12   Mean   :1980   Mean   :2.960e+07   Europe  :360   Mean   :59.47
 Argentina  :  12   3rd Qu.:1993   3rd Qu.:1.959e+07   Oceania : 24   3rd Qu.:70.85
 Australia  :  12   Max.   :2007   Max.   :1.319e+09                  Max.   :82.60
 (Other)    :1632
   gdpPercap
 Min.   :   241.2
 1st Qu.:  1202.1
 Median :  3531.8
 Mean   :  7215.3
 3rd Qu.:  9325.5
 Max.   :113523.1
> typeof(gapminder$year)
[1] "integer"
> typeof(gapminder$country)
[1] "integer"
> str(gapminder$country)
 Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
> dim(gapminder)
[1] 1704    6
> length(gapminder)
[1] 6
> nrow(gapminder)
[1] 1704
> ncol(gapminder)
[1] 6
> colnames(gapminder)
[1] "country"   "year"      "pop"       "continent" "lifeExp"   "gdpPercap"
> head(gapminder)
      country year      pop continent lifeExp gdpPercap
1 Afghanistan 1952  8425333      Asia  28.801  779.4453
2 Afghanistan 1957  9240934      Asia  30.332  820.8530
3 Afghanistan 1962 10267083      Asia  31.997  853.1007
4 Afghanistan 1967 11537966      Asia  34.020  836.1971
5 Afghanistan 1972 13079460      Asia  36.088  739.9811

Data Frame Manipulation With dplyr


Learning Objectives (Data Frames)


What and Why is dplyr?


Split-Apply-Combine


select() - Interactive Demo**

> library(dplyr)
> head(select(gapminder, year, country, gdpPercap))
  year     country gdpPercap
1 1952 Afghanistan  779.4453
2 1957 Afghanistan  820.8530
3 1962 Afghanistan  853.1007
4 1967 Afghanistan  836.1971
5 1972 Afghanistan  739.9811
6 1977 Afghanistan  786.1134
> gapminder %>% select(year, country, gdpPercap) %>% head()
  year     country gdpPercap
1 1952 Afghanistan  779.4453
2 1957 Afghanistan  820.8530
3 1962 Afghanistan  853.1007
4 1967 Afghanistan  836.1971
5 1972 Afghanistan  739.9811
6 1977 Afghanistan  786.1134

filter()

> head(filter(gapminder, continent=="Europe"))
  country year     pop continent lifeExp gdpPercap
1 Albania 1952 1282697    Europe   55.23  1601.056
2 Albania 1957 1476505    Europe   59.28  1942.284
3 Albania 1962 1728137    Europe   64.82  2312.889
4 Albania 1967 1984060    Europe   66.22  2760.197
5 Albania 1972 2263554    Europe   67.69  3313.422
6 Albania 1977 2509048    Europe   68.93  3533.004
# Select gdpPercap by country and year, only for Europe
eurodata <- gapminder %>%
              filter(continent == "Europe") %>%
              select(year, country, gdpPercap)

Challenge

# Select life expectancy by country and year, only for Africa
> afrodata <- gapminder %>%
  filter(continent == "Africa") %>%
  select(year, country, lifeExp)
> nrow(afrodata)
[1] 624


group_by()

> group_by(gapminder, continent)
# A tibble: 1,704 x 6
# Groups:   continent [5]
       country  year      pop continent lifeExp gdpPercap
        <fctr> <int>    <dbl>    <fctr>   <dbl>     <dbl>
 1 Afghanistan  1952  8425333      Asia  28.801  779.4453
 2 Afghanistan  1957  9240934      Asia  30.332  820.8530
 3 Afghanistan  1962 10267083      Asia  31.997  853.1007
 4 Afghanistan  1967 11537966      Asia  34.020  836.1971
 5 Afghanistan  1972 13079460      Asia  36.088  739.9811
 6 Afghanistan  1977 14880372      Asia  38.438  786.1134
 7 Afghanistan  1982 12881816      Asia  39.854  978.0114
 8 Afghanistan  1987 13867957      Asia  40.822  852.3959
 9 Afghanistan  1992 16317921      Asia  41.674  649.3414
10 Afghanistan  1997 22227415      Asia  41.763  635.3414
# ... with 1,694 more rows

summarize()

> # Produce table of mean GDP by continent
> gapminder %>%
+     group_by(continent) %>%
+     summarize(meangdpPercap=mean(gdpPercap))
# A tibble: 5 x 2
  continent meangdpPercap
     <fctr>         <dbl>
1    Africa      2193.755
2  Americas      7136.110
3      Asia      7902.150
4    Europe     14469.476
5   Oceania     18621.609

Challenge 13

# Find average life expectancy by nation
avg_lifexp_country <- gapminder %>%
  group_by(country) %>%
  summarize(meanlifeExp=mean(lifeExp))
> avg_lifexp_country %>% filter(meanlifeExp == min(meanlifeExp))
# A tibble: 1 × 2
  country      meanlifeExp
  <chr>              <dbl>
1 Sierra Leone        36.8
> avg_lifexp_country %>% filter(meanlifeExp == max(meanlifeExp))
# A tibble: 1 × 2
  country meanlifeExp
  <chr>         <dbl>
1 Iceland        76.5


count() and n()

> gapminder %>% filter(year == 2002) %>% count(continent, sort = TRUE)
# A tibble: 5 x 2
  continent     n
     <fctr> <int>
1    Africa    52
2      Asia    33
3    Europe    30
4  Americas    25
5   Oceania     2
> gapminder %>% group_by(continent) %>% summarize(se_lifeExp = sd(lifeExp)/sqrt(n()))
# A tibble: 5 x 2
  continent se_lifeExp
     <fctr>      <dbl>
1    Africa  0.3663016
2  Americas  0.5395389
3      Asia  0.5962151
4    Europe  0.2863536
5   Oceania  0.7747759

mutate()

# Calculate GDP in $billion
gdp_bill <- gapminder %>%
  mutate(gdp_billion = gdpPercap * pop / 10^9)
# Calculate total/sd of GDP by continent and year
gdp_bycontinents_byyear <- gapminder %>%
  mutate(gdp_billion=gdpPercap*pop/10^9) %>%
  group_by(continent,year) %>%
  summarize(mean_gdpPercap=mean(gdpPercap),
            sd_gdpPercap=sd(gdpPercap),
            mean_gdp_billion=mean(gdp_billion),
            sd_gdp_billion=sd(gdp_billion))

ifelse()

gdp_billion_large_countries <- gapminder %>%
  mutate(gdp_billion_large = ifelse(pop > 10e6, 
                                    gdpPercap * pop / 10^9,
                                    NA))
gdp_future_bycontinents_byyear_high_lifeExp <- gapminder %>%
    mutate(gdp_futureExpectation = ifelse(lifeExp > 40, gdpPercap * 1.5, gdpPercap)) %>%
    group_by(continent, year) %>%
    summarize(mean_gdpPercap = mean(gdpPercap),
              mean_gdpPercap_expected = mean(gdp_futureExpectation))

Tidy Data


Why Tidy Data?


An Untidy Dataset (1)


An Untidy Dataset (2)


Data Semantics


Challenge (2min)


A Tidy Dataset (1)


A Tidy Dataset (2)


Tidy Data (2)

> head(gapminder)
      country year      pop continent lifeExp gdpPercap
1 Afghanistan 1952  8425333      Asia  28.801  779.4453
2 Afghanistan 1957  9240934      Asia  30.332  820.8530
3 Afghanistan 1962 10267083      Asia  31.997  853.1007
4 Afghanistan 1967 11537966      Asia  34.020  836.1971
5 Afghanistan 1972 13079460      Asia  36.088  739.9811
6 Afghanistan 1977 14880372      Asia  38.438  786.1134

Long v Wide


gapminder Wide Dataset

> gap_wide <- read.csv("data/gapminder_wide.csv", stringsAsFactors = FALSE)
> str(gap_wide)
'data.frame': 142 obs. of  38 variables:
 $ continent     : chr  "Africa" "Africa" "Africa" "Africa" ...
 $ country       : chr  "Algeria" "Angola" "Benin" "Botswana" ...
 $ gdpPercap_1952: num  2449 3521 1063 851 543 ...
 $ gdpPercap_1957: num  3014 3828 960 918 617 ...
 $ gdpPercap_1962: num  2551 4269 949 984 723 ...
 $ gdpPercap_1967: num  3247 5523 1036 1215 795 ...
 $ gdpPercap_1972: num  4183 5473 1086 2264 855 ...
 $ gdpPercap_1977: num  4910 3009 1029 3215 743 ...
 $ gdpPercap_1982: num  5745 2757 1278 4551 807 ...
 $ gdpPercap_1987: num  5681 2430 1226 6206 912 ...
 $ gdpPercap_1992: num  5023 2628 1191 7954 932 ...
 $ gdpPercap_1997: num  4797 2277 1233 8647 946 ...
 $ gdpPercap_2002: num  5288 2773 1373 11004 1038 ...
 $ gdpPercap_2007: num  6223 4797 1441 12570 1217 ...
 $ lifeExp_1952  : num  43.1 30 38.2 47.6 32 ...
 $ lifeExp_1957  : num  45.7 32 40.4 49.6 34.9 ...
 $ lifeExp_1962  : num  48.3 34 42.6 51.5 37.8 ...
 $ lifeExp_1967  : num  51.4 36 44.9 53.3 40.7 ...
 $ lifeExp_1972  : num  54.5 37.9 47 56 43.6 ...
 $ lifeExp_1977  : num  58 39.5 49.2 59.3 46.1 ...
 $ lifeExp_1982  : num  61.4 39.9 50.9 61.5 48.1 ...
 $ lifeExp_1987  : num  65.8 39.9 52.3 63.6 49.6 ...
 $ lifeExp_1992  : num  67.7 40.6 53.9 62.7 50.3 ...
 $ lifeExp_1997  : num  69.2 41 54.8 52.6 50.3 ...
 $ lifeExp_2002  : num  71 41 54.4 46.6 50.6 ...
 $ lifeExp_2007  : num  72.3 42.7 56.7 50.7 52.3 ...
 $ pop_1952      : num  9279525 4232095 1738315 442308 4469979 ...
 $ pop_1957      : num  10270856 4561361 1925173 474639 4713416 ...
 $ pop_1962      : num  11000948 4826015 2151895 512764 4919632 ...
 $ pop_1967      : num  12760499 5247469 2427334 553541 5127935 ...
 $ pop_1972      : num  14760787 5894858 2761407 619351 5433886 ...
 $ pop_1977      : num  17152804 6162675 3168267 781472 5889574 ...
 $ pop_1982      : num  20033753 7016384 3641603 970347 6634596 ...
 $ pop_1987      : num  23254956 7874230 4243788 1151184 7586551 ...
 $ pop_1992      : num  26298373 8735988 4981671 1342614 8878303 ...
 $ pop_1997      : num  29072015 9875024 6066080 1536536 10352843 ...
 $ pop_2002      : int  31287142 10866106 7026113 1630347 12251209 7021078 15929988 4048013 8835739 614382 ...
 $ pop_2007      : int  33333216 12420476 8078314 1639131 14326203 8390505 17696293 4369038 10238807 710960 ...

Pivot: Wide to Long

> gap_long <- gap_wide %>%
+     pivot_longer(
+         cols = c(starts_with('pop'), starts_with('lifeExp'), starts_with('gdpPercap')),
+         names_to = "obstype_year", values_to = "obs_values"
+     )
> str(gap_long)
tibble [5,112 × 4] (S3: tbl_df/tbl/data.frame)
 $ continent   : chr [1:5112] "Africa" "Africa" "Africa" "Africa" ...
 $ country     : chr [1:5112] "Algeria" "Algeria" "Algeria" "Algeria" ...
 $ obstype_year: chr [1:5112] "pop_1952" "pop_1957" "pop_1962" "pop_1967" ...
 $ obs_values  : num [1:5112] 9279525 10270856 11000948 12760499 14760787 ...

separate()

> gap_long <- gap_long %>% separate(obstype_year, into = c('obs_type', 'year'), sep = "_")
> gap_long$year <- as.integer(gap_long$year)
> str(gap_long)
tibble [5,112 × 5] (S3: tbl_df/tbl/data.frame)
 $ continent : chr [1:5112] "Africa" "Africa" "Africa" "Africa" ...
 $ country   : chr [1:5112] "Algeria" "Algeria" "Algeria" "Algeria" ...
 $ obs_type  : chr [1:5112] "pop" "pop" "pop" "pop" ...
 $ year      : int [1:5112] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ obs_values: num [1:5112] 9279525 10270856 11000948 12760499 14760787 ...

Pivot: Long to Wide

> gap_normal <- gap_long %>% pivot_wider(names_from = obs_type, values_from = obs_values)
> str(gap_normal)
tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
 $ continent: chr [1:1704] "Africa" "Africa" "Africa" "Africa" ...
 $ country  : chr [1:1704] "Algeria" "Algeria" "Algeria" "Algeria" ...
 $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num [1:1704] 9279525 10270856 11000948 12760499 14760787 ...
 $ lifeExp  : num [1:1704] 43.1 45.7 48.3 51.4 54.5 ...
 $ gdpPercap: num [1:1704] 2449 3014 2551 3247 4183 ...

Challenge (5min)

gap_long %>% group_by(continent, obs_type) %>%
   summarize(means=mean(obs_values))