【3.5.1】Pandas--Dataframe信息修改
一、例子
1.1 创建dataframe
import pandas as pd
import numpy as np
dates = pd.date_range('20170101', periods=8)
df = pd.DataFrame(np.random.randn(8,4), index=dates, columns=list('ABCD'))
print("df:")
print(df)
print('-'*50)
结果:
df:
A B C D
2017-01-01 -0.598774 1.076390 -0.642006 -0.089715
2017-01-02 -0.438976 1.063627 0.387825 1.312049
2017-01-03 0.101879 0.469225 0.860522 0.086417
2017-01-04 -0.670031 1.974935 -0.570337 0.478371
2017-01-05 0.250046 -1.385470 -0.893637 -1.786031
2017-01-06 0.876446 -0.167285 -0.475356 -0.145381
2017-01-07 0.291258 0.676994 -1.953909 -0.609507
2017-01-08 -0.569716 0.749637 1.038614 -0.502682
--------------------------------------------------
1.2 新增一列
例1
s=pd.Series(list(range(10,18)),index=pd.date_range('20170101', periods=8))
df["F"]=s#新加一列元素F
print("df['F']=s")
print(df)
print('-'*50)
结果:
df['F']=s
A B C D F
2017-01-01 -0.598774 1.076390 -0.642006 -0.089715 10
2017-01-02 -0.438976 1.063627 0.387825 1.312049 11
2017-01-03 0.101879 0.469225 0.860522 0.086417 12
2017-01-04 -0.670031 1.974935 -0.570337 0.478371 13
2017-01-05 0.250046 -1.385470 -0.893637 -1.786031 14
2017-01-06 0.876446 -0.167285 -0.475356 -0.145381 15
2017-01-07 0.291258 0.676994 -1.953909 -0.609507 16
2017-01-08 -0.569716 0.749637 1.038614 -0.502682 17
--------------------------------------------------
例2 合并两列信息
dataframe["newColumn"] = dataframe["age"].map(str) + dataframe["phone"] + dataframe["address”]
1.3 修改某个数值
方法一: at
修改第一行,“A"列的数据
df.at[dates[0],"A"]=99
print("df.at[dates[0],'A']=99")
print(df)
print('-'*50)
df.at[dates[0],'A']=99
A B C D F
2017-01-01 99.000000 1.076390 -0.642006 -0.089715 10
2017-01-02 -0.438976 1.063627 0.387825 1.312049 11
2017-01-03 0.101879 0.469225 0.860522 0.086417 12
2017-01-04 -0.670031 1.974935 -0.570337 0.478371 13
2017-01-05 0.250046 -1.385470 -0.893637 -1.786031 14
2017-01-06 0.876446 -0.167285 -0.475356 -0.145381 15
2017-01-07 0.291258 0.676994 -1.953909 -0.609507 16
2017-01-08 -0.569716 0.749637 1.038614 -0.502682 17
--------------------------------------------------
方法二: iat
修改第一行,第一列的数据
print("df.iat[1,1]=-66")
df.iat[1,1]=-66
print(df)
print('-'*50)
print("df.loc[:,'D']=np.array([4]*len(df))")
结果
df.iat[1,1]=-66
A B C D F
2017-01-01 99.000000 1.076390 -0.642006 -0.089715 10
2017-01-02 -0.438976 -66.000000 0.387825 1.312049 11
2017-01-03 0.101879 0.469225 0.860522 0.086417 12
2017-01-04 -0.670031 1.974935 -0.570337 0.478371 13
2017-01-05 0.250046 -1.385470 -0.893637 -1.786031 14
2017-01-06 0.876446 -0.167285 -0.475356 -0.145381 15
2017-01-07 0.291258 0.676994 -1.953909 -0.609507 16
2017-01-08 -0.569716 0.749637 1.038614 -0.502682 17
--------------------------------------------------
方法三: loc
新增”D“列
df.loc[:,"D"]=np.array([4]*len(df))
print(df)
print('-'*50)
df2=df.copy()#拷贝
print('-'*50)
结果:
df.loc[:,'D']=np.array([4]*len(df))
A B C D F
2017-01-01 99.000000 1.076390 -0.642006 4 10
2017-01-02 -0.438976 -66.000000 0.387825 4 11
2017-01-03 0.101879 0.469225 0.860522 4 12
2017-01-04 -0.670031 1.974935 -0.570337 4 13
2017-01-05 0.250046 -1.385470 -0.893637 4 14
2017-01-06 0.876446 -0.167285 -0.475356 4 15
2017-01-07 0.291258 0.676994 -1.953909 4 16
2017-01-08 -0.569716 0.749637 1.038614 4 17
--------------------------------------------------
--------------------------------------------------
包含某个字符串,就修改某列的信息
df.loc[df['sport'].str.contains('ball'), 'sport'] = 'ball sport'
df
Out[71]:
name sport
0 Bob tennis
1 Jane ball sport
2 Alice ball sport
方法四
df2[df2>0]=-df2#将df2中的所有大于0的元素值 都改为小于0的
print (df2)
结果
A B C D F
2017-01-01 -99.000000 -1.076390 -0.642006 -4 -10
2017-01-02 -0.438976 -66.000000 -0.387825 -4 -11
2017-01-03 -0.101879 -0.469225 -0.860522 -4 -12
2017-01-04 -0.670031 -1.974935 -0.570337 -4 -13
2017-01-05 -0.250046 -1.385470 -0.893637 -4 -14
2017-01-06 -0.876446 -0.167285 -0.475356 -4 -15
2017-01-07 -0.291258 -0.676994 -1.953909 -4 -16
2017-01-08 -0.569716 -0.749637 -1.038614 -4 -17
1.4 新增一行
from pandas import *
from random import *
df = DataFrame(columns=('lib', 'qty1', 'qty2'))#生成空的pandas表
for i in range(5):#插入一行
df.loc[i] = [randint(-1,1) for n in range(3)]
print df
1.5 对符合要求的列行进行修改
代码:
import pandas as pd
# df = pd.DataFrame({'a':[1,2,3]},'b':[2,3,4])
dt = {'one':[1,2,3,4],'two':[9,8,7,6],'three':[-1,-2,-3,-4]}
df = pd.DataFrame(dt,index =['a','b','c','d'])
# df[df['one']==2,'one'] = df[df['one']==2]['one'] * df[df['one']==2]['two']
print df
df.loc[df['one']==2,'one'] *=100
df.loc[((df['one']==3) & (df['three']==-3) ),'two'] *=50
df.loc[df['three']==-2,'three'] = abs(df['three'] )
print df
结果示例:
one three two
a 1 -1 9
b 2 -2 8
c 3 -3 7
d 4 -4 6
one three two
a 1 -1 9
b 200 2 8
c 3 -3 350
d 4 -4 6
1.6 na的填充
数据
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, 5],
[np.nan, 3, np.nan, 4]],
columns=list('ABCD'))
print df
A B C D
0 NaN 2.0 NaN 0
1 3.0 4.0 NaN 1
2 NaN NaN NaN 5
3 NaN 3.0 NaN 4
填充:
print df.fillna(0)
A B C D
0 0.0 2.0 0.0 0
1 3.0 4.0 0.0 1
2 0.0 0.0 0.0 5
3 0.0 3.0 0.0 4
某一列的填充
df['A'] = df['A'].fillna(0)
print df
A B C D
0 0.0 2.0 NaN 0
1 3.0 4.0 NaN 1
2 0.0 NaN NaN 5
3 0.0 3.0 NaN 4
根据前面的内容来填充后面的
df['A'] = df['A'].fillna(method='ffill')
1.7 为空则进行替换
df_merge['#OTU_1'] = np.where(df_merge['#OTU'].isnull(),
df_merge['#OTU_1'], df_merge['#OTU'])
1.8 pandas dataframe 修改某个数值排名前5的数值
import pandas as pd
# 创建示例数据
data = {'A': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
df = pd.DataFrame(data)
print(df)
# 对列'A'进行排序,找到排名前5的数值的索引
top5_indices = df['A'].argsort()[-5:]
# 修改排名前5的数值为新值
new_value = 999
df.loc[top5_indices, 'A'] = new_value
print("修改后的DataFrame:")
print(df)
二、讨论
用起来,是不是有点像R
参考资料
这里是一个广告位,,感兴趣的都可以发邮件聊聊:tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn
个人公众号,比较懒,很少更新,可以在上面提问题,如果回复不及时,可发邮件给我: tiehan@sina.cn