Supporting Statistical Analysis for Research
6.3 Aggregating data
These examples use the Car.csv data set.
Load the
Car.csvdata set.library(tidyverse)car_path <- file.path("..", "datasets", "Car.csv") car_in <- read_csv(car_path, col_types = cols())Warning: Missing column names filled in: 'X1' [1]car <- car_in %>% rename( id = X1 ) glimpse(car)Observations: 4,654 Variables: 71 $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ... $ choice <chr> "choice1", "choice2", "choice5", "choice5", "choice... $ college <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ... $ hsg2 <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ... $ coml5 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... $ type1 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type2 <chr> "regcar", "van", "truck", "truck", "truck", "regcar... $ type3 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type4 <chr> "stwagon", "stwagon", "van", "van", "van", "van", "... $ type5 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type6 <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ... $ fuel1 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel2 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel3 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel4 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel5 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ fuel6 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ price1 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price2 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price3 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price4 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price5 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ price6 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ range1 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range2 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range3 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range4 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range5 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ range6 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ acc1 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc2 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc3 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc4 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc5 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ acc6 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ speed1 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed2 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed3 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed4 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed5 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ speed6 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ size1 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size2 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size3 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size4 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size5 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ size6 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ space1 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space2 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space3 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space4 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space5 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ space6 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ cost1 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost2 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost3 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost4 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost5 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ cost6 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ station1 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station2 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station3 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station4 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station5 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0... $ station6 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...Create a categorical variable that identifies the body type that was selected first.
car <- car %>% gather(key = type_order, value = type, type1:type6) %>% separate(type_order, into = c("X1","order"), sep = "type") %>% separate(choice, into = c("X2","choice"), sep = "choice") %>% select(-X1, -X2) %>% filter(choice == order) %>% arrange(id, order) car %>% select(choice, id, college, hsg2, coml5, type) %>% head()# A tibble: 6 x 6 choice id college hsg2 coml5 type <chr> <dbl> <dbl> <dbl> <dbl> <chr> 1 1 1 0 0 0 van 2 2 2 1 1 1 van 3 5 3 0 1 0 regcar 4 5 4 0 0 1 regcar 5 5 5 0 1 0 regcar 6 5 6 0 0 0 truckCreate a table of the number of count of the selected types.
car_type_tab <- car %>% group_by(type) %>% summarise(count = n()) %>% ungroup() car_type_tab %>% head()# A tibble: 6 x 2 type count <chr> <int> 1 regcar 2740 2 sportcar 172 3 sportuv 242 4 stwagon 305 5 truck 565 6 van 630Create a table of the number of trucks selected by college degree or not and greater than 2 in the family.
Which of these subgroups selected
truckas their first choice in greatest numbers.car_truck_tab <- car %>% filter(type == "truck") %>% group_by(college, hsg2) %>% summarise(count = n()) %>% ungroup() %>% spread(key = college, value = count) car_truck_tab %>% head()# A tibble: 2 x 3 hsg2 `0` `1` <dbl> <int> <int> 1 0 133 298 2 1 52 82Create the same table with the cells reporting the proportion of these groups that select trucks.
Which of these subgroups was most likely to selected
truckas their first choice.car_truck_tab <- car %>% group_by(college, hsg2) %>% summarise(proportion = sum(type == "truck") / n()) %>% ungroup() %>% spread(key = college, value = proportion) car_truck_tab %>% head()# A tibble: 2 x 3 hsg2 `0` `1` <dbl> <dbl> <dbl> 1 0 0.171 0.105 2 1 0.174 0.112Create a variable that identifies the price of the first selected vehicle.
car <- car %>% gather(key = price_order, value = price, price1:price6) %>% separate(price_order, into = c("X1", "order"), sep = "price") %>% select(-X1) %>% filter(choice == order) %>% arrange(id) car %>% select(id, college, hsg2, coml5, type, price) %>% head()# A tibble: 6 x 6 id college hsg2 coml5 type price <dbl> <dbl> <dbl> <dbl> <chr> <dbl> 1 1 0 0 0 van 4.18 2 2 1 1 1 van 3.31 3 3 0 1 0 regcar 3.28 4 4 0 0 1 regcar 5.46 5 5 0 1 0 regcar 4.69 6 6 0 0 0 truck 4.82Create a table that provides the mean and standard deviation of price by college attendance or not groups.
Note, income is not provide in the data set. So price can not be put back on to the original scale. What is being reported is the price normalized by income.
car_price_tab <- car %>% group_by(college) %>% summarise( mean = mean(price), st_dev = sd(price) ) %>% ungroup() car_price_tab# A tibble: 2 x 3 college mean st_dev <dbl> <dbl> <dbl> 1 0 4.06 2.07 2 1 4.16 1.81