4.5 Dropping unneeded observations

SSCC - Social Science Computing Cooperative

Supporting Statistical Analysis for Research

These exercises use the PSID.csv data set that was imported in the prior section.

Import the PSID.csv data set.

library(tidyverse)

# Build the relative location of the data file from its path components.
psid_path <- file.path("..", "datasets", "PSID.csv")

# Read the CSV into a tibble; the empty cols() specification is passed as the
# column types, leaving the type of each column to be guessed by readr.
psid_in <-
  psid_path %>%
  read_csv(col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

# Give the imported columns descriptive names (new_name = old_name).
# X1 is the name that was filled in for the unnamed first column on import.
psid_in <-
  psid_in %>%
  rename(
    obs_num = X1,
    intvw_num = intnum,
    person_id = persnum,
    marital_status = married
  )

# Working copy of the data without the observation-number column.
psid <- select(psid_in, -obs_num)

# Show the column names, types, and first few values.
psid %>%
  glimpse()

Observations: 4,856
Variables: 8
$ intvw_num      <dbl> 4, 4, 4, 4, 5, 6, 6, 7, 7, 7, 10, 10, 10, 11, 1...
$ person_id      <dbl> 4, 6, 7, 173, 2, 4, 172, 4, 170, 171, 3, 171, 1...
$ age            <dbl> 39, 35, 33, 39, 47, 44, 38, 38, 39, 37, 48, 47,...
$ educatn        <dbl> 12, 12, 12, 10, 9, 12, 16, 9, 12, 11, 13, 12, 1...
$ earnings       <dbl> 77250, 12000, 8000, 15000, 6500, 6500, 7000, 50...
$ hours          <dbl> 2940, 2040, 693, 1904, 1683, 2024, 1144, 2080, ...
$ kids           <dbl> 2, 2, 1, 2, 5, 2, 3, 4, 3, 5, 98, 3, 0, 0, 2, 0...
$ marital_status <chr> "married", "divorced", "married", "married", "m...

Display some of the observations where there are more than 90 kids in the household. Choose several of the pertinent variables to display.

# Show the first 15 observations that report more than 90 kids.
# The columns are narrowed first; kids is among the retained columns, so the
# row condition can be applied afterwards.
psid %>%
  select(person_id, age, educatn, kids, marital_status) %>%
  filter(kids > 90) %>%
  print(n = 15)

# A tibble: 118 x 5
   person_id   age educatn  kids marital_status
       <dbl> <dbl>   <dbl> <dbl> <chr>         
 1         3    48      13    98 divorced      
 2       186    41      12    98 married       
 3       178    49      12    98 married       
 4         5    34      99    99 no histories  
 5         3    34      12    98 divorced      
 6         2    47      12    98 divorced      
 7       182    49      12    99 no histories  
 8         3    48       3    98 never married 
 9        21    49       0    99 no histories  
10       177    40       0    98 married       
11         3    45      12    98 married       
12         2    50       0    99 no histories  
13       171    49       0    98 divorced      
14       173    40       9    98 divorced      
15       175    37       0    98 divorced      
# ... with 103 more rows

Create a copy of the data frame that removes the observations where marital_status is "no histories" or "NA/DF". You may have combined these categories into a missing category in the preparatory exercises.

# Copy of psid that keeps only observations with a usable marital status.
# The category is spelled "no histories" in the data; comparing against
# "no history" matches no rows and would leave those observations in place.
# Conditions separated by commas in filter() are combined with AND, and rows
# for which a condition evaluates to NA are dropped as well.
psid_copy <-
  psid %>%
  filter(
    marital_status != "no histories",
    marital_status != "NA/DF"
  )

# Inspect the first 15 rows of the filtered copy, limited to the key columns.
select(psid_copy, person_id, age, educatn, kids, marital_status) %>%
  print(n = 15)

# A tibble: 4,847 x 5
   person_id   age educatn  kids marital_status
       <dbl> <dbl>   <dbl> <dbl> <chr>         
 1         4    39      12     2 married       
 2         6    35      12     2 divorced      
 3         7    33      12     1 married       
 4       173    39      10     2 married       
 5         2    47       9     5 married       
 6         4    44      12     2 married       
 7       172    38      16     3 married       
 8         4    38       9     4 divorced      
 9       170    39      12     3 married       
10       171    37      11     5 married       
11         3    48      13    98 divorced      
12       171    47      12     3 married       
13       178    40      12     0 separated     
14       171    38      16     0 married       
15         3    41      12     2 married       
# ... with 4,832 more rows