# 【3】数据分析--10--科学计算--Pandas--3--Dataframe信息判断

content:

1. 判断是否为空
2. 判断某列是否包含某个字符串
3. 判断某列是否包含某些字符串
4. 判断某个dataframe中的元素是否在另一个dataframe里面
5. 多列的值进行判断，形成新的列
6. 每行中为0的元素的个数

## 一、判断是否为空

pandas 空值定义为numpy.nan

eg:

pd.isnull(df1) #df1是dataframe变量


eg: np.isnan(df1.iloc[0,3]) #对df1的第0行第3列判断（.ix 已在 pandas 1.0 中移除，按位置取值请用 .iloc）


## 二、判断某列是否包含某个字符串

### 方法一

# Cast every element of column 'b' to str so the .str accessor can be used on it.
df_test['b'] = df_test['b'].astype(str)
# Select all rows whose 'b' value contains the substring 'exp'.
df_enw = df_test.loc[df_test['b'].str.contains('exp')]


### 方法二

import numpy as np
import pandas as pd

# Sample data: six cities with a year and a population figure for each.
data = {
    'city': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chongqing'],
    'year': [2016, 2016, 2015, 2017, 2016, 2016],
    'population': [2100, 2300, 1000, 700, 500, 500],
}
# 'debt' is not a key of `data`, so that column is created and filled with NaN.
frame = pd.DataFrame(data, columns=['year', 'city', 'population', 'debt'])

# print() as a function runs on Python 3 (the statement form was Python 2 only).
print(frame, '\n')

# Flag each row: 1 if the city name contains the substring 'ing', otherwise 0.
frame['panduan'] = frame['city'].apply(lambda name: 1 if 'ing' in name else 0)
print(frame)


   year       city  population debt
0  2016    Beijing        2100  NaN
1  2016   Shanghai        2300  NaN
2  2015  Guangzhou        1000  NaN
3  2017   Shenzhen         700  NaN
4  2016   Hangzhou         500  NaN
5  2016  Chongqing         500  NaN

   year       city  population debt  panduan
0  2016    Beijing        2100  NaN        1
1  2016   Shanghai        2300  NaN        0
2  2015  Guangzhou        1000  NaN        0
3  2017   Shenzhen         700  NaN        0
4  2016   Hangzhou         500  NaN        0
5  2016  Chongqing         500  NaN        1


## 三、判断某列是否包含某些字符串

>>> searchfor = ['og', 'at']
>>> df[df['aa'].str.contains('|'.join(searchfor))]
    aa
0  cat
1  hat
2  dog
3  fog


## 四、判断某个dataframe中的元素是否在另一个dataframe里面

# Keep the rows of df_1 whose 'pdb' value does NOT occur in df_2's 'pdb' column
# (the leading `~` inverts the isin() membership mask).
df_4 = df_1.loc[~df_1['pdb'].isin(df_2['pdb'])]


## 五、多列的值进行判断，形成新的列

### 示例1

import numpy as np
import pandas as pd

# Sample data: six cities with a year and a population figure for each.
data = {
    'city': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chongqing'],
    'year': [2016, 2016, 2015, 2017, 2016, 2016],
    'population': [2100, 2300, 1000, 700, 500, 500],
}
# 'debt' is not a key of `data`, so that column is created and filled with NaN.
frame = pd.DataFrame(data, columns=['year', 'city', 'population', 'debt'])


def function(a, b):
    """Return 1 if *a* contains 'ing' and *b* equals 2016, otherwise 0."""
    if 'ing' in a and b == 2016:
        return 1
    return 0


# print() as a function runs on Python 3 (the statement form was Python 2 only).
print(frame, '\n')

# axis=1 hands each row to the lambda, so values from several columns
# can be combined into one new column.
frame['test'] = frame.apply(lambda row: function(row['city'], row['year']), axis=1)
print(frame)


   year       city  population debt
0  2016    Beijing        2100  NaN
1  2016   Shanghai        2300  NaN
2  2015  Guangzhou        1000  NaN
3  2017   Shenzhen         700  NaN
4  2016   Hangzhou         500  NaN
5  2016  Chongqing         500  NaN

   year       city  population debt  test
0  2016    Beijing        2100  NaN     1
1  2016   Shanghai        2300  NaN     0
2  2015  Guangzhou        1000  NaN     0
3  2017   Shenzhen         700  NaN     0
4  2016   Hangzhou         500  NaN     0
5  2016  Chongqing         500  NaN     1


### 示例2（更简单的可以用来判断两列是否相等）

series1 = pd.Series([1, 2, 3, 4, 5])
series2 = pd.Series([1, 3, 3, 4, 6])

# Build the two-column frame directly from the Series; both share the default
# 0..4 index, so the rows line up one to one.
data_frame = pd.DataFrame({'column1': series1, 'column2': series2})

# Element-wise comparison of the two columns gives a boolean column.
data_frame['bool'] = data_frame['column1'] == data_frame['column2']
print(data_frame)


   column1  column2   bool
0        1        1   True
1        2        3  False
2        3        3   True
3        4        4   True
4        5        6  False


### 示例三（挺有意思的范例）

# Two single-column frames that share the default 0..n-1 index.
df1 = pd.DataFrame({'col1': ['audi', 'cars']})
df2 = pd.DataFrame({'col2': ['audi', 'bike']})

# Put the two columns side by side, aligned on the index.
df = pd.concat([df1, df2], axis=1)

# Label each row according to whether col1 and col2 hold the same value.
same_value = df['col1'].eq(df['col2'])
df['result'] = np.where(same_value, 'no change', 'changed')
print(df)


   col1  col2     result
0  audi  audi  no change
1  cars  bike    changed


## 六、每行中为0的元素的个数

in[34]:df = pd.DataFrame({'a':[1,0,0,1,3],'b':[0,0,1,0,1],'c':[0,0,0,0,0]})
in[35]:df
Out[35]:
   a  b  c
0  1  0  0
1  0  0  0
2  0  1  0
3  1  0  0
4  3  1  0

# Count the zeros in each row: compare every cell with 0, then sum the
# resulting booleans across the columns (vectorized, no per-row apply needed).
(df == 0).sum(axis=1)

Out[40]:
0    2
1    3
2    2
3    2
4    1