SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

4.8 Coding missing values - part 2

These exercises use the PSID.csv data set that was imported in the prior section.

  1. Import the PSID.csv data set.

    library(tidyverse)
    # Build the path from its components, then read the file;
    # col_types = cols() leaves every column type to be guessed.
    psid_path <- file.path("..", "datasets", "PSID.csv")
    psid_in <-
      psid_path %>%
      read_csv(col_types = cols())
    Warning: Missing column names filled in: 'X1' [1]
    # Give the columns descriptive names. X1 is the name that was filled
    # in for the unnamed first column (see the warning above).
    psid_in <-
      psid_in %>%
      rename(
        obs_num = X1,
        intvw_num = intnum,
        person_id = persnum,
        marital_status = married
      )
    
    # Drop the observation-number column, then inspect what remains.
    psid <- select(psid_in, -obs_num)
    glimpse(psid)
    Observations: 4,856
    Variables: 8
    $ intvw_num      <dbl> 4, 4, 4, 4, 5, 6, 6, 7, 7, 7, 10, 10, 10, 11, 1...
    $ person_id      <dbl> 4, 6, 7, 173, 2, 4, 172, 4, 170, 171, 3, 171, 1...
    $ age            <dbl> 39, 35, 33, 39, 47, 44, 38, 38, 39, 37, 48, 47,...
    $ educatn        <dbl> 12, 12, 12, 10, 9, 12, 16, 9, 12, 11, 13, 12, 1...
    $ earnings       <dbl> 77250, 12000, 8000, 15000, 6500, 6500, 7000, 50...
    $ hours          <dbl> 2940, 2040, 693, 1904, 1683, 2024, 1144, 2080, ...
    $ kids           <dbl> 2, 2, 1, 2, 5, 2, 3, 4, 3, 5, 98, 3, 0, 0, 2, 0...
    $ marital_status <chr> "married", "divorced", "married", "married", "m...
  2. Recode the "NA/DF" and "no histories" values of the marital status variable as NA.

    # Replace the "NA/DF" and "no histories" labels with NA; every other
    # value (including values that are already NA) passes through unchanged.
    psid <-
      mutate(
        psid,
        marital_status = case_when(
          marital_status %in% c("NA/DF", "no histories") ~ NA_character_,
          TRUE ~ marital_status
        )
      )
    
    # Check the result: list the rows whose marital status is now missing.
    psid %>%
      select(intvw_num, person_id, age, educatn, kids, marital_status) %>%
      filter(is.na(marital_status)) %>%
      print(n = 15)
    # A tibble: 52 x 6
       intvw_num person_id   age educatn  kids marital_status
           <dbl>     <dbl> <dbl>   <dbl> <dbl> <chr>         
     1       633         5    34      99    99 <NA>          
     2       941       182    49      12    99 <NA>          
     3      1184        21    49       0    99 <NA>          
     4      1709         2    50       0    99 <NA>          
     5      1874       170    48      14    99 <NA>          
     6      1906       170    41      17    99 <NA>          
     7      2508         3    45      17     0 <NA>          
     8      2614         3    37       0    99 <NA>          
     9      2701       185    34       0    99 <NA>          
    10      2704         2    39       0    99 <NA>          
    11      2705       171    48      12    99 <NA>          
    12      2714         3    38      12     1 <NA>          
    13      5195       177    48      99    99 <NA>          
    14      5282       174    36      17     0 <NA>          
    15      5287       177    32      14     1 <NA>          
    # ... with 37 more rows

    or, using recode() instead of case_when():

    # Map the two labels to NA; values not named here are left as they are.
    psid <-
      mutate(
        psid,
        marital_status = recode(
          marital_status,
          "NA/DF" = NA_character_,
          "no histories" = NA_character_
        )
      )
  3. Change the units of the earnings and hours variables to thousands of dollars and thousands of hours, respectively. Use a method that operates on multiple columns.

    Hint: to change the units of a variable x, compute x / 1000.

    # Divide earnings and hours by 1000 in a single call; the columns are
    # overwritten in place. (mutate_at() is superseded by across() in
    # dplyr >= 1.0; it is kept to match the output shown on this page.)
    psid <-
      mutate_at(
        psid,
        vars(earnings, hours),
        function(x) x / 1000
      )
    
    # Check the rescaled columns on the rows with a missing marital status.
    psid %>%
      select(intvw_num, person_id, age, kids, marital_status, earnings, hours) %>%
      filter(is.na(marital_status)) %>%
      print(n = 15)
    # A tibble: 52 x 7
       intvw_num person_id   age  kids marital_status earnings hours
           <dbl>     <dbl> <dbl> <dbl> <chr>             <dbl> <dbl>
     1       633         5    34    99 <NA>               0     0   
     2       941       182    49    99 <NA>               0     0   
     3      1184        21    49    99 <NA>               0     0   
     4      1709         2    50    99 <NA>               0     0   
     5      1874       170    48    99 <NA>              28     2   
     6      1906       170    41    99 <NA>              24.5   1.46
     7      2508         3    45     0 <NA>              42     1.48
     8      2614         3    37    99 <NA>               0     0   
     9      2701       185    34    99 <NA>               0     0   
    10      2704         2    39    99 <NA>               0     0   
    11      2705       171    48    99 <NA>               6.75  1.47
    12      2714         3    38     1 <NA>               8     1.84
    13      5195       177    48    99 <NA>               0     0   
    14      5282       174    36     0 <NA>               0     0   
    15      5287       177    32     1 <NA>              17.2   1.74
    # ... with 37 more rows