pandas 用法 (2)
pandas 讀寫資料檔案
首先準備一個 student.csv 檔
裡面資料長這樣
Student ID,name ,age,gender
1100,Kelly,22,Female
1101,Clo,21,Female
1102,Tilly,22,Female
1103,Tony,24,Male
1104,David,20,Male
1105,Catty,22,Female
1106,M,3,Female
1107,N,43,Male
1108,A,13,Male
1109,S,12,Male
1110,David,33,Male
1111,Dw,3,Female
1112,Q,23,Male
1113,W,21,Female
import pandas as pd
import numpy as np
data = pd.read_csv('student.csv')
print(data)
'''
Student ID name age gender
0 1100 Kelly 22 Female
1 1101 Clo 21 Female
2 1102 Tilly 22 Female
3 1103 Tony 24 Male
4 1104 David 20 Male
5 1105 Catty 22 Female
6 1106 M 3 Female
7 1107 N 43 Male
8 1108 A 13 Male
9 1109 S 12 Male
10 1110 David 33 Male
11 1111 Dw 3 Female
12 1112 Q 23 Male
13 1113 W 21 Female
'''
data.to_pickle('student.pickle')
pandas concatenating 合併資料
- concat 設定 axis=0 為直向合併
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
print(df1)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
'''
print(df2)
'''
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
'''
print(df3)
'''
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
'''
res = pd.concat([df1,df2,df3],axis=0)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
'''
- ignore_index = True 可以忽略合併時舊的 index 欄位,改採用自動產生的 index
res = pd.concat([df1,df2,df3],axis=0, ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
'''
- concat 的 join 屬性有兩種模式 inner, outer(預設)
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*0, columns=['b','c','d','e'],index=[2,3,4])
print(df1)
'''
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
'''
print(df2)
'''
b c d e
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0
'''
res = pd.concat([df1,df2])
res = pd.concat([df1,df2], join='outer')
print(res)
'''
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 0.0 0.0 0.0 0.0
3 NaN 0.0 0.0 0.0 0.0
4 NaN 0.0 0.0 0.0 0.0
'''
res = pd.concat([df1,df2], join='inner', ignore_index=True)
print(res)
'''
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
5 0.0 0.0 0.0
'''
- concat 的 join_axes 功能,用於水瓶合併時指定 index 參考者
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*0, columns=['b','c','d','e'],index=[2,3,4])
res = pd.concat([df1,df2],axis=1, join_axes=[df1.index])
print(res)
'''
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
'''
- 使用 DataFrame append 來合併資料,新增資料
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
res = df1.append(df2, ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
'''
df3 = pd.DataFrame(np.ones((3,4))*3, columns=['a','b','c','d'])
res = df1.append([df2,df3], ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 3.0 3.0 3.0 3.0
7 3.0 3.0 3.0 3.0
8 3.0 3.0 3.0 3.0
'''
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
res = df1.append(s1, ignore_index=True)
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
'''
pandas merge 合併資料
merge by 一個 key
import pandas as pd
import numpy as np
left = pd.DataFrame({
'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({
'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
'''
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
'''
print(right)
'''
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3
'''
res = pd.merge(left,right, on='key')
print(res)
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
'''
merge by 多個 key
import pandas as pd
import numpy as np
left = pd.DataFrame({
'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({
'key1':['K0','K1','K1','K2'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
'''
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
'''
print(right)
'''
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
'''
res = pd.merge(left,right, on=['key1','key2'])
res = pd.merge(left,right, on=['key1','key2'],how='inner')
print(res)
''' 合併後,他預設只把相同的部分留下來 (inner 模式)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
'''
res = pd.merge(left,right, on=['key1','key2'],how='outer')
print(res)
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
'''
res = pd.merge(left,right, on=['key1','key2'],how='right')
print(res)
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
'''
res = pd.merge(left,right, on=['key1','key2'],how='left')
print(res)
'''
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
'''
使用 indicator 顯示 merge 的 mode
import pandas as pd
import numpy as np
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,3], 'col_right':[2,2,2]})
print(df1)
'''
col1 col_left
0 0 a
1 1 b
'''
print(df2)
'''
col1 col_right
0 1 2
1 2 2
2 3 2
'''
res = pd.merge(df1,df2, on='col1', how='outer',indicator=True)
print(res)
'''
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
'''
設定 indicator 欄位的名字
res = pd.merge(df1,df2, on='col1', how='outer',indicator='indicator_column')
print(res)
'''
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
'''
merge by index
import pandas as pd
import numpy as np
left = pd.DataFrame({
'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['K0','K1','K2'])
right = pd.DataFrame({
'C':['C0','C2','C3'],
'D':['D0','D2','D3']},
index=['K0','K2','K3'])
print(left)
'''
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
'''
print(right)
'''
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
'''
res = pd.merge(left,right, left_index=True, right_index=True, how='outer')
print(res)
'''
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
'''
merge 合併時,處理欄位名稱相同衝突,以 suffixes 區別
import pandas as pd
import numpy as np
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})
print(boys)
'''
age k
0 1 K0
1 2 K1
2 3 K2
'''
print(girls)
'''
age k
0 4 K0
1 5 K0
2 6 K3
'''
res = pd.merge(boys,girls, on='k', suffixes=['_boy','_girl'], how='outer')
print(res)
'''
age_boy k age_girl
0 1.0 K0 4.0
1 1.0 K0 5.0
2 2.0 K1 NaN
3 3.0 K2 NaN
4 NaN K3 6.0
'''
pandas plot
- plot 基本用法 畫出 Series 數據
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
- plot 畫出 DataFrame 四個數據
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.DataFrame(np.random.randn(1000,4),
index=np.arange(1000),
columns=list("ABCD"))
data = data.cumsum()
print(data.head(5))
'''
A B C D
0 -0.056981 -0.167990 -0.103564 -0.807399
1 1.008049 -1.633926 0.959755 0.405345
2 0.821038 -3.090023 2.821623 -0.880397
3 0.448243 -4.889474 4.477471 -1.378809
4 -1.453623 -3.347546 5.371346 -2.983690
'''
data.plot()
plt.show()
- scatter 用法
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.DataFrame(np.random.randn(1000,4),
index=np.arange(1000),
columns=list("ABCD"))
data = data.cumsum()
print(data.head(5))
'''
A B C D
0 1.149112 -1.189742 -1.108183 -1.276239
1 0.889289 -0.979980 -0.821403 0.726542
2 1.219525 -0.753984 0.279848 1.686624
3 1.006253 -0.191323 0.595033 0.578449
4 0.710900 -0.820767 0.064716 1.539593
'''
ax = data.plot.scatter(x='A',y='B', color='Red',label='Class 1')
data.plot.scatter(x='A', y='C', color='Green', label='Class 2', ax=ax)
plt.show()
I like .
回覆刪除Great!
回覆刪除