【3.5.2】Summarizing data

目前有一批数据,在用ggplot作图前,我需要先知道它的平均值和方差是多少,这个时候就需要对数据预先处理一下。

目前的策略有三种:

  1. ddply() 需要 plyr包
  2. summarizeBy() 需要doBy包
  3. aggregate() R内置的函数,但用起来麻烦

一、测试数据:

data <- read.table(header=TRUE, text='
 subject sex condition before after change
	   1   F   placebo   10.1   6.9   -3.2
	   2   F   placebo    6.3   4.2   -2.1
	   3   M   aspirin   12.4   6.3   -6.1
	   4   F   placebo    8.1   6.1   -2.0
	   5   M   aspirin   15.2   9.9   -5.3
	   6   F   aspirin   10.9   7.0   -3.9
	   7   F   aspirin   11.6   8.5   -3.1
	   8   M   aspirin    9.5   3.0   -6.5
	   9   F   placebo   11.5   9.0   -2.5
	  10   M   placebo   11.9  11.0   -0.9
	  11   F   aspirin   11.4   8.0   -3.4
	  12   M   aspirin   10.0   4.4   -5.6
	  13   M   aspirin   12.5   5.4   -7.1
	  14   M   placebo   10.6  10.6    0.0
	  15   M   aspirin    9.1   4.3   -4.8
	  16   F   placebo   12.1  10.2   -1.9
	  17   F   placebo   11.0   8.8   -2.2
	  18   F   placebo   11.9  10.2   -1.7
	  19   M   aspirin    9.1   3.6   -5.5
	  20   M   placebo   13.5  12.4   -1.1
	  21   M   aspirin   12.0   7.5   -4.5
	  22   F   placebo    9.1   7.6   -1.5
	  23   M   placebo    9.9   8.0   -1.9
	  24   F   placebo    7.6   5.2   -2.4
	  25   F   placebo   11.8   9.7   -2.1
	  26   F   placebo   11.8  10.7   -1.1
	  27   F   aspirin   10.1   7.9   -2.2
	  28   M   aspirin   11.6   8.3   -3.3
	  29   F   aspirin   11.3   6.8   -4.5
	  30   F   placebo   10.3   8.3   -2.0
 ')

二、ddply

library(plyr)

# Run the functions length, mean, and sd on the value of "change" for each group, 
# broken down by sex + condition
cdata <- ddply(data, c("sex", "condition"), summarise,
			   N    = length(change),
			   mean = mean(change),
			   sd   = sd(change),
			   se   = sd / sqrt(N)
)
cdata
#>   sex condition  N      mean        sd        se
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190
#> 4   M   placebo  4 -0.975000 0.7804913 0.3902456

处理NA数据,因为lenght不具有na.rm选项

# Put some NA's in the data
dataNA <- data
dataNA$change[11:14] <- NA

cdata <- ddply(dataNA, c("sex", "condition"), summarise,
			   N    = sum(!is.na(change)),
			   mean = mean(change, na.rm=TRUE),
			   sd   = sd(change, na.rm=TRUE),
			   se   = sd / sqrt(N)
)
cdata
#>   sex condition  N      mean        sd        se
#> 1   F   aspirin  4 -3.425000 0.9979145 0.4989572
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867
#> 3   M   aspirin  7 -5.142857 1.0674848 0.4034713
#> 4   M   placebo  3 -1.300000 0.5291503 0.3055050

升级版的ddply

## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
					  conf.interval=.95, .drop=TRUE) {
	library(plyr)

	# New version of length which can handle NA's: if na.rm==T, don't count them
	length2 <- function (x, na.rm=FALSE) {
		if (na.rm) sum(!is.na(x))
		else       length(x)
	}

	# This does the summary. For each group's data frame, return a vector with
	# N, mean, and sd
	datac <- ddply(data, groupvars, .drop=.drop,
	  .fun = function(xx, col) {
		c(N    = length2(xx[[col]], na.rm=na.rm),
		  mean = mean   (xx[[col]], na.rm=na.rm),
		  sd   = sd     (xx[[col]], na.rm=na.rm)
		)
	  },
	  measurevar
	)

	# Rename the "mean" column    
	datac <- rename(datac, c("mean" = measurevar))

	datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

	# Confidence interval multiplier for standard error
	# Calculate t-statistic for confidence interval: 
	# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
	ciMult <- qt(conf.interval/2 + .5, datac$N-1)
	datac$ci <- datac$se * ciMult

	return(datac)
}

summarySE的使用

summarySE(data, measurevar="change", groupvars=c("sex", "condition"))
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#> 4   M   placebo  4 -0.975000 0.7804913 0.3902456 1.2419358

# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex", "condition"), na.rm=TRUE)
#>   sex condition  N    change        sd        se        ci
#> 1   F   aspirin  4 -3.425000 0.9979145 0.4989572 1.5879046
#> 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#> 3   M   aspirin  7 -5.142857 1.0674848 0.4034713 0.9872588
#> 4   M   placebo  3 -1.300000 0.5291503 0.3055050 1.3144821

参考资料:

http://www.cookbook-r.com/Manipulating_data/Summarizing_data/

药企,独角兽,苏州。团队长期招人,感兴趣的都可以发邮件聊聊:tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn