SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

4.5 Dropping unneeded observations

These exercises use the PSID.csv data set that was imported in the prior section.

  1. Import the PSID.csv data set.

    # Attach the tidyverse, then read the raw PSID file from the datasets folder.
    library(tidyverse)
    psid_path <- file.path("..", "datasets", "PSID.csv")
    # An empty cols() specification leaves the column types for readr to guess.
    psid_in <-
      psid_path %>%
      read_csv(col_types = cols())
    Warning: Missing column names filled in: 'X1' [1]
    # Give the columns descriptive names; X1 is the name readr filled in for
    # the unnamed first column (see the warning above).
    psid_in <-
      psid_in %>%
      rename(
        obs_num = X1,
        intvw_num = intnum,
        person_id = persnum,
        marital_status = married
      )

    # The working copy omits the obs_num column.
    psid <- select(psid_in, -obs_num)
    glimpse(psid)
    Observations: 4,856
    Variables: 8
    $ intvw_num      <dbl> 4, 4, 4, 4, 5, 6, 6, 7, 7, 7, 10, 10, 10, 11, 1...
    $ person_id      <dbl> 4, 6, 7, 173, 2, 4, 172, 4, 170, 171, 3, 171, 1...
    $ age            <dbl> 39, 35, 33, 39, 47, 44, 38, 38, 39, 37, 48, 47,...
    $ educatn        <dbl> 12, 12, 12, 10, 9, 12, 16, 9, 12, 11, 13, 12, 1...
    $ earnings       <dbl> 77250, 12000, 8000, 15000, 6500, 6500, 7000, 50...
    $ hours          <dbl> 2940, 2040, 693, 1904, 1683, 2024, 1144, 2080, ...
    $ kids           <dbl> 2, 2, 1, 2, 5, 2, 3, 4, 3, 5, 98, 3, 0, 0, 2, 0...
    $ marital_status <chr> "married", "divorced", "married", "married", "m...
  2. Display some of the observations where there are more than 90 kids in the household. Choose several of the pertinent variables to display.

    # Keep the pertinent columns, then list the rows reporting more than 90 kids.
    # NOTE(review): such values are presumably missing-data codes rather than
    # real counts; verify against the PSID codebook.
    psid %>%
      select(person_id, age, educatn, kids, marital_status) %>%
      filter(kids > 90) %>%
      print(n = 15)
    # A tibble: 118 x 5
       person_id   age educatn  kids marital_status
           <dbl> <dbl>   <dbl> <dbl> <chr>         
     1         3    48      13    98 divorced      
     2       186    41      12    98 married       
     3       178    49      12    98 married       
     4         5    34      99    99 no histories  
     5         3    34      12    98 divorced      
     6         2    47      12    98 divorced      
     7       182    49      12    99 no histories  
     8         3    48       3    98 never married 
     9        21    49       0    99 no histories  
    10       177    40       0    98 married       
    11         3    45      12    98 married       
    12         2    50       0    99 no histories  
    13       171    49       0    98 divorced      
    14       173    40       9    98 divorced      
    15       175    37       0    98 divorced      
    # ... with 103 more rows
  3. Create a copy of the data frame that removes the observations where married (renamed marital_status above) was "no histories" or "NA/DF". You may have combined these categories into a missing category in the preparatory exercises.

    # Drop the observations whose marital status is uninformative.
    # The data spell the first level "no histories" (see the kids > 90 listing
    # above); comparing against "no history" matches no rows and removes nothing.
    # NOTE(review): the printed output that follows was generated with the old
    # "no history" spelling; re-run this chunk to refresh the row count.
    psid_copy <-
      psid %>%
      filter(
        marital_status != "no histories",
        marital_status != "NA/DF"
      )

    # Preview the filtered copy using the same columns as the previous exercise.
    psid_copy %>%
      select(person_id, age, educatn, kids, marital_status) %>%
      print(n = 15)
    # A tibble: 4,847 x 5
       person_id   age educatn  kids marital_status
           <dbl> <dbl>   <dbl> <dbl> <chr>         
     1         4    39      12     2 married       
     2         6    35      12     2 divorced      
     3         7    33      12     1 married       
     4       173    39      10     2 married       
     5         2    47       9     5 married       
     6         4    44      12     2 married       
     7       172    38      16     3 married       
     8         4    38       9     4 divorced      
     9       170    39      12     3 married       
    10       171    37      11     5 married       
    11         3    48      13    98 divorced      
    12       171    47      12     3 married       
    13       178    40      12     0 separated     
    14       171    38      16     0 married       
    15         3    41      12     2 married       
    # ... with 4,832 more rows