SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

6.3 Aggregating data

These examples use the Car.csv data set.

  1. Load the Car.csv data set.

    library(tidyverse)
    car_path <- file.path("..", "datasets", "Car.csv")
    car_in <- read_csv(car_path, col_types = cols())
    Warning: Missing column names filled in: 'X1' [1]
    car <- 
      car_in %>%
      rename(
        id = X1
        )
    
    glimpse(car)
    Observations: 4,654
    Variables: 71
    $ id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
    $ choice     <chr> "choice1", "choice2", "choice5", "choice5", "choice...
    $ college    <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ...
    $ hsg2       <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
    $ coml5      <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
    $ type1      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
    $ type2      <chr> "regcar", "van", "truck", "truck", "truck", "regcar...
    $ type3      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
    $ type4      <chr> "stwagon", "stwagon", "van", "van", "van", "van", "...
    $ type5      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
    $ type6      <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ...
    $ fuel1      <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",...
    $ fuel2      <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",...
    $ fuel3      <chr> "electric", "cng", "electric", "cng", "electric", "...
    $ fuel4      <chr> "electric", "cng", "electric", "cng", "electric", "...
    $ fuel5      <chr> "gasoline", "gasoline", "gasoline", "electric", "ga...
    $ fuel6      <chr> "gasoline", "gasoline", "gasoline", "electric", "ga...
    $ price1     <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3...
    $ price2     <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3...
    $ price3     <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598...
    $ price4     <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598...
    $ price5     <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050...
    $ price6     <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050...
    $ range1     <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20...
    $ range2     <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20...
    $ range3     <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,...
    $ range4     <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,...
    $ range5     <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3...
    $ range6     <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3...
    $ acc1       <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4...
    $ acc2       <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4...
    $ acc3       <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6...
    $ acc4       <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6...
    $ acc5       <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2...
    $ acc6       <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2...
    $ speed1     <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ...
    $ speed2     <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ...
    $ speed3     <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85...
    $ speed4     <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85...
    $ speed5     <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95...
    $ speed6     <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95...
    $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0...
    $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0...
    $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7...
    $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7...
    $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2...
    $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2...
    $ size1      <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ...
    $ size2      <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ...
    $ size3      <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ...
    $ size4      <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ...
    $ size5      <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ...
    $ size6      <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ...
    $ space1     <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1...
    $ space2     <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1...
    $ space3     <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1...
    $ space4     <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1...
    $ space5     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    $ space6     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    $ cost1      <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ...
    $ cost2      <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ...
    $ cost3      <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ...
    $ cost4      <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ...
    $ cost5      <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ...
    $ cost6      <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ...
    $ station1   <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0...
    $ station2   <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0...
    $ station3   <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0...
    $ station4   <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0...
    $ station5   <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
    $ station6   <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
  2. Create a categorical variable that identifies the body type that was selected first.

    car <- 
      car %>%
      gather(key = type_order, value = type, type1:type6) %>%
      separate(type_order, into = c("X1","order"), sep = "type") %>%
      separate(choice, into = c("X2","choice"), sep = "choice") %>%
      select(-X1, -X2) %>%
      filter(choice == order) %>%
      arrange(id, order)
    
    car %>%
      select(choice, id,  college, hsg2, coml5, type) %>%
      head()
    # A tibble: 6 x 6
      choice    id college  hsg2 coml5 type  
      <chr>  <dbl>   <dbl> <dbl> <dbl> <chr> 
    1 1          1       0     0     0 van   
    2 2          2       1     1     1 van   
    3 5          3       0     1     0 regcar
    4 5          4       0     0     1 regcar
    5 5          5       0     1     0 regcar
    6 5          6       0     0     0 truck 
  3. Create a table of the number of count of the selected types.

    car_type_tab <- 
      car %>%
      group_by(type) %>%
      summarise(count = n()) %>%
      ungroup()
    
    car_type_tab %>%
      head()
    # A tibble: 6 x 2
      type     count
      <chr>    <int>
    1 regcar    2740
    2 sportcar   172
    3 sportuv    242
    4 stwagon    305
    5 truck      565
    6 van        630
  4. Create a table of the number of trucks selected by college degree or not and greater than 2 in the family.

    Which of these subgroups selected truck as their first choice in greatest numbers.

    car_truck_tab <- 
      car %>%
      filter(type == "truck") %>%
      group_by(college, hsg2) %>%
      summarise(count = n()) %>%
      ungroup() %>%
      spread(key = college, value = count)
    
    car_truck_tab %>%
      head()
    # A tibble: 2 x 3
       hsg2   `0`   `1`
      <dbl> <int> <int>
    1     0   133   298
    2     1    52    82
  5. Create the same table with the cells reporting the proportion of these groups that select trucks.

    Which of these subgroups was most likely to selected truck as their first choice.

    car_truck_tab <- 
      car %>%
      group_by(college, hsg2) %>%
      summarise(proportion = sum(type == "truck") / n()) %>%
      ungroup() %>%
      spread(key = college, value = proportion)
    
    car_truck_tab %>%
      head()
    # A tibble: 2 x 3
       hsg2   `0`   `1`
      <dbl> <dbl> <dbl>
    1     0 0.171 0.105
    2     1 0.174 0.112
  6. Create a variable that identifies the price of the first selected vehicle.

    car <- 
      car %>%
      gather(key = price_order, value = price, price1:price6) %>%
      separate(price_order, into = c("X1", "order"), sep = "price") %>%
      select(-X1) %>%
      filter(choice == order) %>%
      arrange(id)
    
    car %>%
      select(id,  college, hsg2, coml5, type, price) %>%
      head()
    # A tibble: 6 x 6
         id college  hsg2 coml5 type   price
      <dbl>   <dbl> <dbl> <dbl> <chr>  <dbl>
    1     1       0     0     0 van     4.18
    2     2       1     1     1 van     3.31
    3     3       0     1     0 regcar  3.28
    4     4       0     0     1 regcar  5.46
    5     5       0     1     0 regcar  4.69
    6     6       0     0     0 truck   4.82
  7. Create a table that provides the mean and standard deviation of price by college attendance or not groups.

    Note, income is not provide in the data set. So price can not be put back on to the original scale. What is being reported is the price normalized by income.

    car_price_tab <- 
      car %>%
      group_by(college) %>%
      summarise(
        mean = mean(price),
        st_dev = sd(price)
        ) %>%
      ungroup()
    
    car_price_tab
    # A tibble: 2 x 3
      college  mean st_dev
        <dbl> <dbl>  <dbl>
    1       0  4.06   2.07
    2       1  4.16   1.81