Summarize Scalars or Matrices by Cross-Classification
summarize
is a fast version of summary.formula(formula,
method="cross",overall=FALSE)
for producing stratified summary statistics
and storing them in a data frame for plotting (especially with trellis
xyplot
and dotplot
and Hmisc xYplot
). Unlike
aggregate
, summarize
accepts a matrix as its first
argument and a multi-valued FUN
argument and summarize
also labels the variables in the new data
frame using their original names. Unlike methods based on
tapply
, summarize
stores the values of the stratification
variables using their original types, e.g., a numeric by
variable
will remain a numeric variable in the collapsed data frame.
summarize
also retains "label"
attributes for variables.
summarize
works especially well with the Hmisc xYplot
function for displaying multiple summaries of a single variable on each
panel, such as means and upper and lower confidence limits.
asNumericMatrix
converts a data frame into a numeric matrix,
saving attributes to reverse the process by matrix2dataFrame
.
It saves attributes that are commonly preserved across row
subsetting (i.e., it does not save dim
, dimnames
, or
names
attributes).
matrix2dataFrame
converts a numeric matrix back into a data
frame if it was created by asNumericMatrix
.
summarize(X, by, FUN, ..., stat.name=deparse(substitute(X)), type=c('variables','matrix'), subset=TRUE, keepcolnames=FALSE) asNumericMatrix(x) matrix2dataFrame(x, at=attr(x, 'origAttributes'), restoreAll=TRUE)
X |
a vector or matrix capable of being operated on by the
function specified as the FUN argument |
by |
one or more stratification variables. If a single
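variable, by may be given as a plain vector; several variables are passed as a list (e.g., llist(year, month), as in the examples) |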
FUN |
a function of a single vector argument, used to create the statistical
summaries for summarize; FUN may return more than one value |
... |
extra arguments are passed to FUN |
stat.name |
the name to use when creating the main summary variable. By default,
the name of the X argument is used |
type |
Specify type="matrix" to store the summaries in a single matrix-valued variable of the returned data frame |
subset |
a logical vector or integer vector of subscripts used to specify the subset of data to use in the analysis. The default is to use all observations in the data frame. |
keepcolnames |
by default when |
x |
a data frame (for asNumericMatrix) or a numeric matrix (for matrix2dataFrame) |
at |
List containing attributes of original data frame that survive
subsetting. Defaults to attribute "origAttributes" of x |
restoreAll |
set to |
For summarize
, a data frame containing the by
variables and the
statistical summaries (the first of which is named the same as the X
variable unless stat.name
is given). If type="matrix"
, the
summaries are stored in a single variable in the data frame, and this
variable is a matrix.
asNumericMatrix
returns a numeric matrix and stores an object
origAttributes
as an attribute of the returned object, with original
attributes of component variables and their storage.mode
.
matrix2dataFrame
returns a data frame.
Frank Harrell
Department of Biostatistics
Vanderbilt University
fh@fharrell.com
## Not run: 
s <- summarize(ap>1, llist(size=cut2(sz, g=4), bone), mean, stat.name='Proportion')
dotplot(Proportion ~ size | bone, data=s)
## End(Not run)
set.seed(1)
temperature <- rnorm(300, 70, 10)
month <- sample(1:12, 300, TRUE)
year <- sample(2000:2001, 300, TRUE)
g <- function(x)c(Mean=mean(x,na.rm=TRUE),Median=median(x,na.rm=TRUE))
summarize(temperature, month, g)
mApply(temperature, month, g)
mApply(temperature, month, mean, na.rm=TRUE)
w <- summarize(temperature, month, mean, na.rm=TRUE)
library(lattice)
xyplot(temperature ~ month, data=w) # plot mean temperature by month
w <- summarize(temperature, llist(year,month), quantile, probs=c(.5,.25,.75), na.rm=TRUE, type='matrix')
xYplot(Cbind(temperature[,1],temperature[,-1]) ~ month | year, data=w)
mApply(temperature, llist(year,month), quantile, probs=c(.5,.25,.75), na.rm=TRUE)
# Compute the median and outer quartiles. The outer quartiles are
# displayed using "error bars"
set.seed(111)
dfr <- expand.grid(month=1:12, year=c(1997,1998), reps=1:100)
attach(dfr)
y <- abs(month-6.5) + 2*runif(length(month)) + year-1997
s <- summarize(y, llist(month,year), smedian.hilow, conf.int=.5)
s
mApply(y, llist(month,year), smedian.hilow, conf.int=.5)
xYplot(Cbind(y,Lower,Upper) ~ month, groups=year, data=s, keys='lines', method='alt')
# Can also do:
s <- summarize(y, llist(month,year), quantile, probs=c(.5,.25,.75), stat.name=c('y','Q1','Q3'))
xYplot(Cbind(y, Q1, Q3) ~ month, groups=year, data=s, keys='lines')
# To display means and bootstrapped nonparametric confidence intervals
# use for example:
s <- summarize(y, llist(month,year), smean.cl.boot)
xYplot(Cbind(y, Lower, Upper) ~ month | year, data=s)
# For each subject use the trapezoidal rule to compute the area under
# the (time,response) curve using the Hmisc trap.rule function
x <- cbind(time=c(1,2,4,7, 1,3,5,10),response=c(1,3,2,4, 1,3,2,4))
subject <- c(rep(1,4),rep(2,4))
trap.rule(x[1:4,1],x[1:4,2])
summarize(x, subject, function(y) trap.rule(y[,1],y[,2]))
## Not run: 
# Another approach would be to properly re-shape the mm array below
# This assumes no missing cells. There are many other approaches.
# mApply will do this well while allowing for missing cells.
m <- tapply(y, list(year,month), quantile, probs=c(.25,.5,.75))
mm <- array(unlist(m), dim=c(3,2,12), dimnames=list(c('lower','median','upper'),c('1997','1998'), as.character(1:12)))
# aggregate will help but it only allows you to compute one quantile
# at a time; see also the Hmisc mApply function
dframe <- aggregate(y, list(Year=year,Month=month), quantile, probs=.5)
# Compute expected life length by race assuming an exponential
# distribution - can also use summarize
g <- function(y) { # computations for one race group
  futime <- y[,1]; event <- y[,2]
  sum(futime)/sum(event) # assume event=1 for death, 0=alive
}
mApply(cbind(followup.time, death), race, g)
# To run mApply on a data frame:
xn <- asNumericMatrix(x)
m <- mApply(xn, race, h)
# Here assume h is a function that returns a matrix similar to x
matrix2dataFrame(m)
# Get stratified weighted means
g <- function(y) wtd.mean(y[,1],y[,2])
summarize(cbind(y, wts), llist(sex,race), g, stat.name='y')
mApply(cbind(y,wts), llist(sex,race), g)
# Compare speed of mApply vs. by for computing
d <- data.frame(sex=sample(c('female','male'),100000,TRUE), country=sample(letters,100000,TRUE), y1=runif(100000), y2=runif(100000))
g <- function(x) {
  y <- c(median(x[,'y1']-x[,'y2']), med.sum =median(x[,'y1']+x[,'y2']))
  names(y) <- c('med.diff','med.sum')
  y
}
system.time(by(d, llist(sex=d$sex,country=d$country), g))
system.time({
  x <- asNumericMatrix(d)
  a <- subsAttr(d)
  m <- mApply(x, llist(sex=d$sex,country=d$country), g)
})
system.time({
  x <- asNumericMatrix(d)
  summarize(x, llist(sex=d$sex, country=d$country), g)
})
# An example where each subject has one record per diagnosis but sex of
# subject is duplicated for all the rows a subject has. Get the cross-
# classified frequencies of diagnosis (dx) by sex and plot the results
# with a dot plot
count <- rep(1,length(dx))
d <- summarize(count, llist(dx,sex), sum)
Dotplot(dx ~ count | sex, data=d)
## End(Not run)
d <- list(x=1:10, a=factor(rep(c('a','b'), 5)), b=structure(letters[1:10], label='label for b'), d=c(rep(TRUE,9), FALSE), f=pi*(1 : 10))
x <- asNumericMatrix(d)
attr(x, 'origAttributes')
matrix2dataFrame(x)
detach('dfr')
# Run summarize on a matrix to get column means
x <- c(1:19,NA)
y <- 101:120
z <- cbind(x, y)
g <- c(rep(1, 10), rep(2, 10))
summarize(z, g, colMeans, na.rm=TRUE, stat.name='x')
# Also works on an all numeric data frame
summarize(as.data.frame(z), g, colMeans, na.rm=TRUE, stat.name='x')
Please choose more modern alternatives, such as Google Chrome or Mozilla Firefox.