 Supporting Statistical Analysis for Research
  Supporting Statistical Analysis for Research
 6.2 Tidy data
These examples use the Car.csv data set.
- Load the - Car.csvdata set.- library(tidyverse)- car_path <- file.path("..", "datasets", "Car.csv") car_in <- read_csv(car_path, col_types = cols())- Warning: Missing column names filled in: 'X1' [1]- car <- car_in %>% rename( id = X1 ) glimpse(car)- Observations: 4,654 Variables: 71 $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ... $ choice <chr> "choice1", "choice2", "choice5", "choice5", "choice... $ college <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ... $ hsg2 <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ... $ coml5 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... $ type1 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type2 <chr> "regcar", "van", "truck", "truck", "truck", "regcar... $ type3 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type4 <chr> "stwagon", "stwagon", "van", "van", "van", "van", "... $ type5 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type6 <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ... $ fuel1 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel2 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel3 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel4 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel5 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ fuel6 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ price1 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price2 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price3 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price4 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price5 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ price6 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ range1 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range2 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range3 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range4 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range5 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ range6 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ acc1 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc2 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc3 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc4 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc5 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ acc6 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ speed1 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed2 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed3 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed4 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed5 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ speed6 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ size1 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size2 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size3 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size4 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size5 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ size6 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ space1 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space2 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space3 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space4 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space5 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ space6 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ cost1 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost2 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost3 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost4 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost5 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ cost6 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ station1 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station2 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station3 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station4 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station5 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0... $ station6 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
- Create a new data frame that uses only the variables for vehicle chosen, demographics, and body type options. Tidy this new data frame. - car_type <- car %>% select(id:type6) %>% gather(key = type_order, value = type, -id:-coml5) %>% separate(type_order, into = c("X1","order"), sep = "type") %>% select(-X1) %>% arrange(id, order) car_type %>% select(choice, order, college, hsg2, coml5, type) %>% head()- # A tibble: 6 x 6 choice order college hsg2 coml5 type <chr> <chr> <dbl> <dbl> <dbl> <chr> 1 choice1 1 0 0 0 van 2 choice1 2 0 0 0 regcar 3 choice1 3 0 0 0 van 4 choice1 4 0 0 0 stwagon 5 choice1 5 0 0 0 van 6 choice1 6 0 0 0 truck
- Using the data set with all the variables on the vehicle options, Tidy the data set. - To gather multiple variables from multiple sets of columns, one gathers on all columns from all the sets. Then modify the variable created as the - keyto contain the names of the new variables. Finish by spreading using the column that contains the variable names. This is a common set of steps used when shapping a data frame. That is make the data frame as tall enough to contain everything that will be spread. Then spread to the desired columns.- car_temp <- car %>% gather(key = type_order, value = value, -id:-coml5) %>% separate(type_order, into = c("variable", "order"), sep = -1) %>% spread(key = variable, value = value) %>% arrange(id, order) glimpse(car_temp)- Observations: 27,924 Variables: 17 $ id <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3... $ choice <chr> "choice1", "choice1", "choice1", "choice1", "choice1... $ college <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0... $ hsg2 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... $ coml5 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0... $ order <chr> "1", "2", "3", "4", "5", "6", "1", "2", "3", "4", "5... $ acc <chr> "4", "4", "6", "6", "2.5", "2.5", "2.5", "2.5", "4",... $ cost <chr> "4", "4", "6", "6", "8", "8", "4", "4", "8", "8", "4... $ fuel <chr> "cng", "cng", "electric", "electric", "gasoline", "g... $ pollution <chr> "0.6", "0.6", "0.25", "0.25", "0.5", "0.5", "0", "0"... $ price <chr> "4.1753448", "4.1753448", "4.8177056", "4.8177056", ... $ range <chr> "250", "250", "400", "400", "250", "250", "125", "12... $ size <chr> "3", "3", "2", "2", "3", "3", "3", "3", "3", "3", "2... $ space <chr> "0.7", "0.7", "1", "1", "1", "1", "0.7", "0.7", "1",... $ speed <chr> "95", "95", "110", "110", "140", "140", "85", "85", ... $ station <chr> "0.1", "0.1", "0.3", "0.3", "1", "1", "0", "0", "0.1... $ type <chr> "van", "regcar", "van", "stwagon", "van", "truck", "...