# 【3.8】subset/which/[--筛选数据和提取

[]、which与subset是常用的从数据框中提取数据的命令，本博文讨论一下其常规的一些用法

## 一、[ ]数据的提取

[ 返回与原对象类型相同的对象，可以一次提取不止一个元素

[[ 用来提取 list 或 data frame 里面的元素，一次只能提取单个元素

$ 通过名字来提取 list 或 data frame 的元素。

> x <- c("a", "b", "c", "c", "d", "a")
> x[1]
[1] "a"
> x[2]
[1] "b"
> x[1:4]
[1] "a" "b" "c" "c"
> x[x > "a"]
[1] "b" "c" "c" "d"
> u <- x > "a"
> u
[1] FALSE  TRUE  TRUE  TRUE  TRUE FALSE
> x[u]
[1] "b" "c" "c" "d"

矩阵通过 (i, j) 下标来提取

> x <- matrix(1:6, 2, 3)
> x[1, 2]
[1] 3
> x[2, 1]
[1] 2

Indices can also be missing.

> x[1, ]
[1] 1 3 5
> x[, 2]
[1] 3 4

通过<span style="color: #ff0000;">drop = FALSE</span>，让提取出来的元素也是一个矩阵的形式。

> x <- matrix(1:6, 2, 3)
> x[1, 2]
[1] 3
> x[1, 2, drop = FALSE]
     [,1]
[1,]    3
> x <- matrix(1:6, 2, 3)
> x[1, ]
[1] 1 3 5
> x[1, , drop = FALSE]
     [,1] [,2] [,3]
[1,]    1    3    5

list的提取

> x <- list(foo = 1:4, bar = 0.6)
> x[1]
$foo
[1] 1 2 3 4

> x[[1]]
[1] 1 2 3 4
> x$bar
[1] 0.6
> x[["bar"]]
[1] 0.6
> x["bar"]
$bar
[1] 0.6


> x <- list(foo = 1:4, bar = 0.6, baz = "hello")
> x[c(1, 3)]
$foo
[1] 1 2 3 4

$baz
[1] "hello"

$ 只能使用字面上的元素名；如果名字保存在变量里（即计算得到的索引），还是得用 [[ ]]

> x <- list(foo = 1:4, bar = 0.6, baz = "hello")
> name <- "foo"
> x[[name]]  ## computed index for 'foo'
[1] 1 2 3 4
> x$name     ## element 'name' doesn't exist!
NULL
> x$foo      ## element 'foo' does exist
[1] 1 2 3 4

The [[ can take an integer sequence.

> x <- list(a = list(10, 12, 14), b = c(3.14, 2.81))
> x[[c(1, 3)]]
[1] 14
> x[[1]][[3]]
[1] 14
> x[[c(2, 1)]]
[1] 3.14

$ 默认可以部分匹配名字；[[ 需要指定 exact = FALSE 才会部分匹配。
> x <- list(aardvark = 1:5)
> x$a
[1] 1 2 3 4 5
> x[["a"]]
NULL
> x[["a", exact = FALSE]]
[1] 1 2 3 4 5

删掉缺失值(NAs).

> x <- c(1, 2, NA, 4, NA, 5)
> bad <- is.na(x)
> x[!bad]
[1] 1 2 4 5

同时去掉多个对象中的NA

> x <- c(1, 2, NA, 4, NA, 5)
> y <- c("a", "b", NA, "d", NA, "f")
> good <- complete.cases(x, y)
> good
[1]  TRUE  TRUE FALSE  TRUE FALSE  TRUE
> x[good]
[1] 1 2 4 5
> y[good]
[1] "a" "b" "d" "f"
> airquality[1:6, ]
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> good <- complete.cases(airquality)
> airquality[good, ][1:6, ]  ##非常强大的complete.cases
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
7    23     299  8.6   65     5   7

### 根据第几列或者列的名字进行提取

#选出df第1、3、5列 ( df <- df[,c(1,3,5)] )
> df.tmp <- df[,c(1,3,5)]
> df.tmp
  ID Chinese English
1  1      65      23
2  2      37      45
3  3      65      67

## 二、which的用法

> zz<-c(5,2,-3,8,12,1)
> which(zz*zz>8)
[1] 1 3 4 5

which返回的是满足条件的元素的索引号

  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6

（下面的 a 应是与上表相同的数据框，例如 a <- airquality）

Extract the subset of rows of the data frame where Ozone values are above 31 and Temp values are above 90

> d<- a[which(a$Ozone>31 & a$Temp>90),]

提取Month等于6的所有行：

f<- a[which(a$Month==6),]


# Keep the rows of arc_b in which at least one of columns 2..24 exceeds 0.01.
# rowSums() over the logical comparison counts the TRUE entries per row, so
# "count > 0" is the vectorised form of OR-ing the 23 per-column tests.
# na.rm = TRUE mirrors the OR chain: a row with any TRUE is kept even if other
# cells are NA, and which() drops rows that have no TRUE at all.
arc_b_1 <- arc_b[which(rowSums(arc_b[, 2:24] > 0.01, na.rm = TRUE) > 0), ]


# Count how many entries of x are strictly greater than 0.01.
myfun <- function(x) {
  sum(x > 0.01)
}

# Columns 2 through (nn + 1) of all_80_percent.
t80 <- all_80_percent[, 2:(nn + 1)]

# Keep the rows where the per-row count is non-zero, i.e. at least one of the
# selected columns exceeds 0.01.
all_80_percent_1 <- all_80_percent[apply(t80, 1, myfun) != 0, ]

# Count how many entries of x are greater than or equal to 0.01.
myfun <- function(x) {
  sum(x >= 0.01)
}

# Required number of qualifying entries per row: 70% of (147 - 99 + 1).
# NOTE(review): 99..147 presumably indexes the samples/columns in use - confirm.
m <- 0.7 * (147 - 99 + 1)

# Keep the rows of bac_b whose corresponding row of `total` has at least m
# entries >= 0.01.
cun_5 <- bac_b[apply(total, 1, myfun) >= m, ]

Selecting (Keeping) Variables
# select variables v1, v2, v3
myvars <- c("v1", "v2", "v3")
newdata <- mydata[myvars]
注意这里用的是[ ]

# another method: build the names "v1", "v2", "v3" programmatically.
# paste0() concatenates without a separator (same result as paste(..., sep = "")).
myvars <- paste0("v", 1:3)
newdata <- mydata[myvars]

# select 1st and 5th thru 10th variables
newdata <- mydata[c(1,5:10)]

Excluding (DROPPING) Variables
# exclude variables v1, v2, v3
myvars <- names(mydata) %in% c("v1", "v2", "v3")
newdata <- mydata[!myvars]
# exclude 3rd and 5th variable
newdata <- mydata[c(-3,-5)]
注：符号“-”和“!”都表示“非”（排除）的意思
# delete variables v3 and v5

mydata$v3 <- mydata$v5 <- NULL
Selecting Observations
# first 5 observations
newdata <- mydata[1:5,]
# based on variable values
newdata <- mydata[ which(mydata$gender=='F' & mydata$age > 65), ]
# or, without repeating `mydata$`: evaluate the condition inside mydata.
# The columns must come from mydata itself (not from the already-filtered
# newdata), otherwise the row positions returned by which() do not line up
# with mydata. with() gives the same convenience as attach()/detach() without
# modifying the search path.
newdata <- mydata[with(mydata, which(gender == "F" & age > 65)), ]
Selection using the Subset Function

重新提取数据的奇数行
nii<-ni[seq(1,nrow(ni),2),]


# Test whether any element matches a value: which() returns the matching
# positions, so a zero-length result means "no match".
aa$a <- c(6, 1, 4, 5, 5, 1)
if (length(which(aa$a == 100)) > 0) print("zzz")  # no element equals 100: prints nothing
if (length(which(aa$a == 6)) > 0) print("zzz")    # one element equals 6: prints "zzz"
if (length(which(aa$a == 6)) == 0) print("zzz")   # negated test: prints nothing


## 三 、Subset

# Using the subset function we select all rows that have a value of age
# greater than or equal to 20 or age less than 10. We keep the ID and Weight
# columns.

newdata <- subset(mydata, age >= 20 | age < 10, select = c(ID, Weight))


In the next example, we select all men over the age of 25 and we keep variables weight through income (weight, income and all columns between them).

# using subset function (part 2)

newdata <- subset(mydata, sex=="m" & age > 25,select=weight:income)
注：<span style="color: #ff0000;">“&”表示“和”，“|”表示“或”</span>


Random Samples

Use the sample( ) function to take a random sample of size n from a dataset.

#take a random sample of size 50 from a dataset mydata
#sample without replacement

mysample <- mydata[sample(1:nrow(mydata), 50,replace=FALSE),]

library(gcookbook) # For the data set
cdat <- subset(countries, Year==2009 &
Name %in% c("Canada", "Ireland", "United Kingdom", "United States"))

library(gcookbook) # For the data set
c2009 <- subset(countries, Year==2009,
select=c(Name, GDP, laborrate, healthexp, infmortality))

pairs(c2009[,2:5])


## 四、subset筛选与索引筛选的区别

> x<-c(6,1:3,NA,12)
> x
[1] 6 1 2 3 NA 12
> x[x>5]
[1] 6 NA 12
> subset(x,x>5)
[1] 6 12


## 五、案例分析

1.测序公司一般给的思路就是针对一个OTU，算出它在所有样品中的丰度之和，然后按照丰度大小，选取前100个OTU来呈现（上海某家测序公司）

2.选取只要在任意一个样品中丰度值大于0.01的OTU，这是我现在更倾向的策略，至于这个阈值是多少，可以人为设定。那么，针对第二种策略用R如何实现呢？

> xx<-rnorm(10)
> xx
 [1]  0.20129745  0.56048658 -0.03100652
 [4] -2.14841137 -0.33119550 -0.98065025
 [7] -0.97097716 -0.53269168 -0.16772913
[10]  0.53356301

> x1<-matrix(xx,5)
> x1
[,1]       [,2]
[1,]  0.20129745 -0.9806503
[2,]  0.56048658 -0.9709772
[3,] -0.03100652 -0.5326917
[4,] -2.14841137 -0.1677291
[5,] -0.33119550  0.5335630

> x2<-matrix(0.01,5,2)
> x2
[,1] [,2]
[1,] 0.01 0.01
[2,] 0.01 0.01
[3,] 0.01 0.01
[4,] 0.01 0.01
[5,] 0.01 0.01

> yy<-x1>x2
> yy
[,1]  [,2]
[1,]  TRUE FALSE
[2,]  TRUE FALSE
[3,] FALSE FALSE
[4,] FALSE FALSE
[5,] FALSE  TRUE

> colSums(yy)
[1] 2 1

> rowSums(yy)
[1] 1 1 0 0 1

> x3<-x1[as.logical(rowSums(yy)),]
> x3
[,1]       [,2]
[1,]  0.2012974 -0.9806503
[2,]  0.5604866 -0.9709772
[3,] -0.3311955  0.5335630


myfun<-function(x){sum(x>0.01)}
> apply(x1,2,myfun)
[1] 2 1

> apply(x1,1,myfun)
[1] 1 1 0 0 1
> x4<-x1[as.logical(apply(x1,1,myfun)),]
> x4
[,1]       [,2]
[1,]  0.2012974 -0.9806503
[2,]  0.5604866 -0.9709772
[3,] -0.3311955  0.5335630


otus1<-otus[,2:24]; > nrow(otus1)
[1] 9143
> nrow(otus1)
[1] 9143

for (i in 1:23) {
for (m in 1:9143){
if (otus1[m,i]>0) otus1[m,i]


nn=mnn),]

> x4
[,1]       [,2]
[1,]  0.20129745 -0.9806503
[2,]  0.56048658 -0.9709772
[3,] -0.03100652 -0.5326917
[4,] -2.14841137 -0.1677291
[5,] -0.33119550  0.5335630
`