# Writing a function to scale (center/standardize)
# all the numeric variables in a data frame
#
# (1) Write an example that works
# (2) Turn that into a function
# (3) Test and refine
#
# First illustrate the general problem: scale() does not
# work with data frames
library(faraway)
scale(hsb) # this call is expected to stop with an error

# Step (1)
# make a copy of the data frame
x <- hsb
# figure out which columns are scalable
# (vapply() always returns a logical vector; sapply() can return a list)
cols <- vapply(hsb, is.numeric, logical(1))
# use scale() on those columns
# this results in a matrix
# (drop = FALSE keeps a single selected column as a one-column data frame,
# so its column name survives)
scaledvars <- scale(hsb[, cols, drop = FALSE])
# use the matrix to write back to the data frame
x[, cols] <- scaledvars
# check the results
head(x)
str(x)
rm(x, scaledvars, cols) # cleanup

# Step (2)
# scale_df: center/standardize every numeric column of `dfr`.
#   dfr     a data frame
#   returns a copy of `dfr` in which the numeric columns have been replaced
#           by their scale()d values; other columns are left untouched
scale_df <- function(dfr) {
  x <- dfr
  cols <- vapply(dfr, is.numeric, logical(1))
  if (!any(cols)) {
    # nothing to scale: hand the data back unchanged
    return(x)
  }
  scaledvars <- scale(dfr[, cols, drop = FALSE])
  x[, cols] <- scaledvars
  return(x) # or just "x"
}

# Step (3)
z <- scale_df(hsb)
head(z)
rm(z, scale_df)

# Refinement: make sure "dfr" is a data frame!
# scale_df2: like scale_df(), but stops with an error when `dfr` is not
# a data frame.
scale_df2 <- function(dfr) {
  if (!is.data.frame(dfr)) {
    stop("dfr must be a data frame")
  }
  x <- dfr
  cols <- vapply(dfr, is.numeric, logical(1))
  if (!any(cols)) {
    # nothing to scale: hand the data back unchanged
    return(x)
  }
  scaledvars <- scale(dfr[, cols, drop = FALSE])
  x[, cols] <- scaledvars
  return(x)
}

z <- scale_df2(hsb)
head(z)
scale_df2(hsb$math) # expected to stop: a single column is not a data frame
rm(z, scale_df2)

# Refinement two
# We'll choose a function name that makes this a "method" of
# the generic function, scale()
#   dfr     a data frame (anything else stops with an error)
#   returns a copy of `dfr` with its numeric columns scaled
scale.data.frame <- function(dfr) {
  if (!is.data.frame(dfr)) {
    stop("dfr must be a data frame")
  }
  x <- dfr
  cols <- vapply(dfr, is.numeric, logical(1))
  if (!any(cols)) {
    # nothing to scale: hand the data back unchanged
    return(x)
  }
  # call scale.default() directly, otherwise we get a recursive loop
  scaledvars <- scale.default(dfr[, cols, drop = FALSE])
  x[, cols] <- scaledvars
  return(x)
}

z <- scale.data.frame(hsb)
head(z)
scale.data.frame(hsb$math) # expected to stop: not a data frame
rm(z)

# Here is the magic!
# With a scale.data.frame() method defined, the generic scale()
# dispatches to it for data frames
z <- scale(hsb)
head(z)
str(z)
scale(hsb$math) # this now works by the default method
# notice the attributes at the end
rm(z)

# Refinement three - better error message, keep attributes
#   dfr     a data frame (anything else stops with an error that names the
#           expression the caller passed)
#   returns a copy of `dfr` with its numeric columns scaled; the
#           "scaled:center" and "scaled:scale" attributes produced by
#           scale.default() are copied onto the result. A data frame with
#           no numeric columns is returned unchanged.
scale.data.frame <- function(dfr) {
  if (!is.data.frame(dfr)) {
    stop(paste(deparse(substitute(dfr)), "must be a data frame"))
  }
  x <- dfr
  # vapply() always returns a logical vector; sapply() can return a list
  cols <- vapply(dfr, is.numeric, logical(1))
  if (!any(cols)) {
    # nothing to scale: hand the data back unchanged
    return(x)
  }
  # call scale.default() directly, otherwise we get a recursive loop;
  # drop = FALSE keeps a single numeric column as a one-column data frame
  # so the attributes stay named by column
  scaledvars <- scale.default(dfr[, cols, drop = FALSE])
  x[, cols] <- scaledvars
  attr(x, "scaled:center") <- attr(scaledvars, "scaled:center")
  attr(x, "scaled:scale") <- attr(scaledvars, "scaled:scale")
  return(x)
}

# The error message will only be used if someone tries to bypass scale() and
# use scale.data.frame() directly
z <- scale(hsb)
head(z)
str(z)
scale.data.frame(hsb$math) # expected to stop: not a data frame
rm(z)

## Refinement four: pass parameters
#   dfr     a data frame (anything else stops with an error that names the
#           expression the caller passed)
#   ...     further arguments forwarded to scale.default()
#           (for example scale = FALSE, as used below)
#   returns a copy of `dfr` with its numeric columns scaled, carrying the
#           "scaled:center" and "scaled:scale" attributes from
#           scale.default(). A data frame with no numeric columns is
#           returned unchanged.
scale.data.frame <- function(dfr, ...) {
  if (!is.data.frame(dfr)) {
    stop(paste(deparse(substitute(dfr)), "must be a data frame"))
  }
  x <- dfr
  # vapply() always returns a logical vector; sapply() can return a list
  cols <- vapply(dfr, is.numeric, logical(1))
  if (!any(cols)) {
    # nothing to scale: hand the data back unchanged
    return(x)
  }
  # call scale.default() directly, otherwise we get a recursive loop;
  # drop = FALSE keeps a single numeric column as a one-column data frame
  # so the attributes stay named by column
  scaledvars <- scale.default(dfr[, cols, drop = FALSE], ...)
  x[, cols] <- scaledvars
  attr(x, "scaled:center") <- attr(scaledvars, "scaled:center")
  attr(x, "scaled:scale") <- attr(scaledvars, "scaled:scale")
  return(x)
}

z <- scale(hsb, scale = FALSE)
str(z)