# 【3】数据分析--10--科学计算--Pandas--3--Dataframe(合并，转换，拼接)

## 一、 concat

In [1]: df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
...:                     'B': ['B0', 'B1', 'B2', 'B3'],
...:                     'C': ['C0', 'C1', 'C2', 'C3'],
...:                     'D': ['D0', 'D1', 'D2', 'D3']},
...:                     index=[0, 1, 2, 3])
...:

In [2]: df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
...:                     'B': ['B4', 'B5', 'B6', 'B7'],
...:                     'C': ['C4', 'C5', 'C6', 'C7'],
...:                     'D': ['D4', 'D5', 'D6', 'D7']},
...:                      index=[4, 5, 6, 7])
...:

In [3]: df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
...:                     'B': ['B8', 'B9', 'B10', 'B11'],
...:                     'C': ['C8', 'C9', 'C10', 'C11'],
...:                     'D': ['D8', 'D9', 'D10', 'D11']},
...:                     index=[8, 9, 10, 11])
...:

In [4]: frames = [df1, df2, df3]

In [5]: result = pd.concat(frames)


In [6]: result = pd.concat(frames, keys=['x', 'y', 'z'])


## 二、根据列来合并两个dataframe

### 2.1 求dataframe的交集

import pandas as pd

# Left table: rows K0-K2 with columns A, B, C.
left = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2'],
        'B': ['B0', 'B1', 'B2'],
        'C': ['C0', 'C2', 'C4'],
    },
    index=['K0', 'K1', 'K2'],
)

# Right table: shares column C (values C0/C2) and row labels K0/K2 with `left`.
right = pd.DataFrame(
    {
        'C': ['C0', 'C2', 'C3'],
        'D': ['D0', 'D2', 'D3'],
    },
    index=['K0', 'K2', 'K3'],
)

print 'Left:\n'
print left
Left:

A   B   C
K0  A0  B0  C0
K1  A1  B1  C2
K2  A2  B2  C4

print '\nRight\n'
print right
Right

C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3


# Inner join on the row index: only labels present in both frames survive.
# Column C exists on both sides, so it comes back as C_x (left) and C_y (right).
result_1 = pd.merge(
    left,
    right,
    how='inner',
    left_index=True,
    right_index=True,
)

print result_1

A   B C_x C_y   D
K0  A0  B0  C0  C0  D0
K2  A2  B2  C4  C2  D2


# Inner join on the values of the shared column C; the result carries a fresh
# 0..n-1 integer index instead of the K* row labels.
result_2 = pd.merge(
    left,
    right,
    how='inner',
    on='C',
)
print result_2

A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C2  D2


# Same join as on='C', spelled with explicit per-side key columns; left_on /
# right_on allow the key column to be named differently in each frame.
result_3 = pd.merge(
    left,
    right,
    how='inner',
    left_on='C',
    right_on='C',
)
print result_3

A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C2  D2


`how` 参数的取值：

• left 以左边的dataframe为主（保留左表的全部键，相当于左连接）
• right 以右边的dataframe为主（保留右表的全部键，相当于右连接）
• inner 求两个dataframe的交集（只保留两边都有的键）
• outer 求两个dataframe的并集（保留两边出现过的所有键）

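## 三、判断某个dataframe中的元素是否在另一个dataframe里面

df_4 = df_1[(~df_1.pdb.isin(df_2.pdb))]

`isin` 逐个判断 df_1.pdb 的值是否出现在 df_2.pdb 中；前面的 `~` 表示取反，因此 df_4 是 df_1 中 pdb 不在 df_2 里的那些行。去掉 `~` 则得到 pdb 在 df_2 里的行。


## 四、用B的某一列数据去填充A的某一列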

#### 方法一：merge + np.where

# Fill df_a's column B with the matching values from df_b, keyed on column A.
#
# Use a left join so `merged` keeps exactly df_a's rows in df_a's order. An
# outer join also adds rows for keys that exist only in df_b and sorts the
# keys, so the positional assignment below would misalign with df_a or fail
# with a length mismatch.
# NOTE: assumes the key column 'A' is unique in df_b; duplicate keys there
# would still multiply rows -- confirm against the real data.
merged = df_a.merge(df_b, on='A', how='left',
                    suffixes=('_dfa', '_dfb'))

# Take df_b's value where a match exists, otherwise keep df_a's original value.
# np.where returns a plain array, so the assignment is by position, not by index.
df_a['B'] = np.where(merged['B_dfb'].isnull(),
                     merged['B_dfa'],
                     merged['B_dfb']).astype(int)


np.where(condition, x, y)：逐元素判断，满足条件(condition)的位置取 x 中对应的值，不满足的位置取 y 中对应的值。

#### 方法二：update（更稳妥）

df.update(other) 按索引和列名对齐，用 other 中的非 NaN 值原地覆盖 df 的对应位置；other 中多出来的行和列会被忽略，NaN 不会覆盖原值。

>>> df = pd.DataFrame({'A': [1, 2, 3],
...                    'B': [400, 500, 600]})
>>> new_df = pd.DataFrame({'B': [4, 5, 6],
...                        'C': [7, 8, 9]})
>>> df.update(new_df)
>>> df
A  B
0  1  4
1  2  5
2  3  6


>>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
...                    'B': ['x', 'y', 'z']})
>>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
>>> df.update(new_df)
>>> df
A  B
0  a  d
1  b  e
2  c  f


>>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
...                    'B': ['x', 'y', 'z']})
>>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
>>> df.update(new_column)
>>> df
A  B
0  a  d
1  b  y
2  c  e
>>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
...                    'B': ['x', 'y', 'z']})
>>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
>>> df.update(new_df)
>>> df
A  B
0  a  x
1  b  d
2  c  e


>>> df = pd.DataFrame({'A': [1, 2, 3],
...                    'B': [400, 500, 600]})
>>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
>>> df.update(new_df)
>>> df
A      B
0  1    4.0
1  2  500.0
2  3    6.0