DataFrame查增改删
一、查 Read
(1)类list/ndarray数据访问方式
import pandas as pd
dates = pd.date_range('20130101',periods=10)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
'2013-01-09', '2013-01-10'],
dtype='datetime64[ns]', freq='D')
import numpy as np
df = pd.DataFrame(np.random.randn(10,4),index=dates,columns=['A','B','C','D'])
df.head()
|
A |
B |
C |
D |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
-0.161571 |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
0.630770 |
2013-01-03 |
-0.706805 |
0.829414 |
-1.758934 |
-1.512252 |
2013-01-04 |
-0.159045 |
0.130182 |
1.089689 |
0.024567 |
2013-01-05 |
-1.971381 |
0.848481 |
0.823581 |
0.541739 |
df['A'].head()
2013-01-01 -0.094887
2013-01-02 0.699451
2013-01-03 -0.706805
2013-01-04 -0.159045
2013-01-05 -1.971381
Freq: D, Name: A, dtype: float64
df.A.head()
2013-01-01 -0.094887
2013-01-02 0.699451
2013-01-03 -0.706805
2013-01-04 -0.159045
2013-01-05 -1.971381
Freq: D, Name: A, dtype: float64
df['A']['2013-01-01']
0.75407705661157032
df.A['2013-01-01']
0.75407705661157032
df[['A','C']].head()
|
A |
C |
2013-01-01 |
-0.094887 |
-0.191152 |
2013-01-02 |
0.699451 |
-0.826253 |
2013-01-03 |
-0.706805 |
-1.758934 |
2013-01-04 |
-0.159045 |
1.089689 |
2013-01-05 |
-1.971381 |
0.823581 |
(2)Pandas专用的数据访问方式
pandas的索引函数主要有三种:
loc 标签索引,行和列的名称
iloc 整型索引(绝对位置索引),绝对意义上的几行几列,起始索引为0
ix 是 iloc 和 loc的合体
at是loc的快捷方式
iat是iloc的快捷方式
(2.1).loc
通过自定义索引获取数据
df.loc['2013-01-01']
A -0.094887
B -1.334026
C -0.191152
D -0.161571
Name: 2013-01-01 00:00:00, dtype: float64
df.loc[:,'A'].head()
2013-01-01 -0.094887
2013-01-02 0.699451
2013-01-03 -0.706805
2013-01-04 -0.159045
2013-01-05 -1.971381
Freq: D, Name: A, dtype: float64
df.loc['2013-01-01','A']
0.75407705661157032
df.loc[[dates[0],dates[2]],:]
|
A |
B |
C |
D |
2013-01-01 |
0.754077 |
-0.346202 |
-0.557050 |
0.778106 |
2013-01-03 |
0.174730 |
2.056007 |
1.781379 |
1.643397 |
df.loc[:,['A','B']].head()
|
A |
B |
2013-01-01 |
-0.094887 |
-1.334026 |
2013-01-02 |
0.699451 |
1.864515 |
2013-01-03 |
-0.706805 |
0.829414 |
2013-01-04 |
-0.159045 |
0.130182 |
2013-01-05 |
-1.971381 |
0.848481 |
df.loc[[dates[0],dates[2]],['A','B']]
|
A |
B |
2013-01-01 |
0.754077 |
-0.346202 |
2013-01-03 |
0.174730 |
2.056007 |
df.loc['2013-01-01':'2013-01-04',:]
|
A |
B |
C |
D |
2013-01-01 |
0.754077 |
-0.346202 |
-0.557050 |
0.778106 |
2013-01-02 |
0.103394 |
-1.051044 |
-0.413054 |
0.268955 |
2013-01-03 |
0.174730 |
2.056007 |
1.781379 |
1.643397 |
2013-01-04 |
-0.950517 |
-0.226887 |
-0.097138 |
-0.442010 |
df.loc[:,'A':'C'].head()
|
A |
B |
C |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
2013-01-03 |
-0.706805 |
0.829414 |
-1.758934 |
2013-01-04 |
-0.159045 |
0.130182 |
1.089689 |
2013-01-05 |
-1.971381 |
0.848481 |
0.823581 |
df.loc['2013-01-01':'2013-01-04','A':'C']
|
A |
B |
C |
2013-01-01 |
0.754077 |
-0.346202 |
-0.557050 |
2013-01-02 |
0.103394 |
-1.051044 |
-0.413054 |
2013-01-03 |
0.174730 |
2.056007 |
1.781379 |
2013-01-04 |
-0.950517 |
-0.226887 |
-0.097138 |
(2.2).iloc 通过默认索引获取数据
df.iloc[3]
A -0.950517
B -0.226887
C -0.097138
D -0.442010
Name: 2013-01-04 00:00:00, dtype: float64
df.iloc[:,2].head()
2013-01-01 -0.191152
2013-01-02 -0.826253
2013-01-03 -1.758934
2013-01-04 1.089689
2013-01-05 0.823581
Freq: D, Name: C, dtype: float64
df.iloc[1,2]
-0.41305425875508139
df.iloc[[1,2,4],:]
|
A |
B |
C |
D |
2013-01-02 |
0.103394 |
-1.051044 |
-0.413054 |
0.268955 |
2013-01-03 |
0.174730 |
2.056007 |
1.781379 |
1.643397 |
2013-01-05 |
0.076178 |
-0.518970 |
1.142290 |
-0.952401 |
df.iloc[:,[0,2]].head()
|
A |
C |
2013-01-01 |
-0.094887 |
-0.191152 |
2013-01-02 |
0.699451 |
-0.826253 |
2013-01-03 |
-0.706805 |
-1.758934 |
2013-01-04 |
-0.159045 |
1.089689 |
2013-01-05 |
-1.971381 |
0.823581 |
df.iloc[[1,2,4],[0,2]]
|
A |
C |
2013-01-02 |
0.103394 |
-0.413054 |
2013-01-03 |
0.174730 |
1.781379 |
2013-01-05 |
0.076178 |
1.142290 |
df.iloc[1:3,:]
|
A |
B |
C |
D |
2013-01-02 |
0.103394 |
-1.051044 |
-0.413054 |
0.268955 |
2013-01-03 |
0.174730 |
2.056007 |
1.781379 |
1.643397 |
df.iloc[:,1:3].head()
|
B |
C |
2013-01-01 |
-1.334026 |
-0.191152 |
2013-01-02 |
1.864515 |
-0.826253 |
2013-01-03 |
0.829414 |
-1.758934 |
2013-01-04 |
0.130182 |
1.089689 |
2013-01-05 |
0.848481 |
0.823581 |
df.iloc[3:5,0:2]
|
A |
B |
2013-01-04 |
-0.950517 |
-0.226887 |
2013-01-05 |
0.076178 |
-0.518970 |
(2.3)Boolean索引
df[df.A > 0]
|
A |
B |
C |
D |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
0.630770 |
2013-01-07 |
0.117383 |
0.103409 |
-1.039062 |
1.929632 |
2013-01-10 |
1.109493 |
-0.708581 |
0.284196 |
0.938002 |
b = df[df > 0]
b.head()
|
A |
B |
C |
D |
2013-01-01 |
NaN |
NaN |
NaN |
NaN |
2013-01-02 |
0.699451 |
1.864515 |
NaN |
0.630770 |
2013-01-03 |
NaN |
0.829414 |
NaN |
NaN |
2013-01-04 |
NaN |
0.130182 |
1.089689 |
0.024567 |
2013-01-05 |
NaN |
0.848481 |
0.823581 |
0.541739 |
type(b['A']['2013-01-01'])
numpy.float64
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three','five','four','three','five']
df2.head()
|
A |
B |
C |
D |
E |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
-0.161571 |
one |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
0.630770 |
one |
2013-01-03 |
-0.706805 |
0.829414 |
-1.758934 |
-1.512252 |
two |
2013-01-04 |
-0.159045 |
0.130182 |
1.089689 |
0.024567 |
three |
2013-01-05 |
-1.971381 |
0.848481 |
0.823581 |
0.541739 |
four |
df2['E'].isin(['one','four']).head()
2013-01-01 True
2013-01-02 True
2013-01-03 False
2013-01-04 False
2013-01-05 True
Freq: D, Name: E, dtype: bool
df2[df2['E'].isin(['one','four'])]
|
A |
B |
C |
D |
E |
2013-01-01 |
0.754077 |
-0.346202 |
-0.557050 |
0.778106 |
one |
2013-01-02 |
0.103394 |
-1.051044 |
-0.413054 |
0.268955 |
one |
2013-01-05 |
0.076178 |
-0.518970 |
1.142290 |
-0.952401 |
four |
2013-01-08 |
-1.246918 |
1.530266 |
1.761499 |
0.940741 |
four |
二、增 Create
s1 = pd.Series([1,2,3,4,5,6],
index=pd.date_range('20130102', periods=6))
s1
2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
df2['F'] = s1
df2.head()
|
A |
B |
C |
D |
E |
F |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
-0.161571 |
one |
NaN |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
0.630770 |
one |
1.0 |
2013-01-03 |
-0.706805 |
0.829414 |
-1.758934 |
-1.512252 |
two |
2.0 |
2013-01-04 |
-0.159045 |
0.130182 |
1.089689 |
0.024567 |
three |
3.0 |
2013-01-05 |
-1.971381 |
0.848481 |
0.823581 |
0.541739 |
four |
4.0 |
三、改 Update
df2.loc[:,'D'].head()
2013-01-01 -0.161571
2013-01-02 0.630770
2013-01-03 -1.512252
2013-01-04 0.024567
2013-01-05 0.541739
Freq: D, Name: D, dtype: float64
df2.loc[:,'D'] = 5
df2.head()
|
A |
B |
C |
D |
E |
F |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
5 |
one |
NaN |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
5 |
one |
1.0 |
2013-01-03 |
-0.706805 |
0.829414 |
-1.758934 |
5 |
two |
2.0 |
2013-01-04 |
-0.159045 |
0.130182 |
1.089689 |
5 |
three |
3.0 |
2013-01-05 |
-1.971381 |
0.848481 |
0.823581 |
5 |
four |
4.0 |
df2.iloc[1,3]
5
df2.iloc[1,3] = 10.1
df2.head()
|
A |
B |
C |
D |
E |
F |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
5.0 |
one |
NaN |
2013-01-02 |
0.699451 |
1.864515 |
-0.826253 |
10.1 |
one |
1.0 |
2013-01-03 |
-0.706805 |
0.829414 |
-1.758934 |
5.0 |
two |
2.0 |
2013-01-04 |
-0.159045 |
0.130182 |
1.089689 |
5.0 |
three |
3.0 |
2013-01-05 |
-1.971381 |
0.848481 |
0.823581 |
5.0 |
four |
4.0 |
df3 = df.copy()
df3[df3 > 0] = -df3
df3.head()
|
A |
B |
C |
D |
2013-01-01 |
-0.094887 |
-1.334026 |
-0.191152 |
-0.161571 |
2013-01-02 |
-0.699451 |
-1.864515 |
-0.826253 |
-0.630770 |
2013-01-03 |
-0.706805 |
-0.829414 |
-1.758934 |
-1.512252 |
2013-01-04 |
-0.159045 |
-0.130182 |
-1.089689 |
-0.024567 |
2013-01-05 |
-1.971381 |
-0.848481 |
-0.823581 |
-0.541739 |
四、删 drop
(1)删除行
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.arange(1, 17).reshape(4, 4),
index = ['北京', '上海', '广州', '深圳'],
columns = ['2015', '2016', '2017', '2018'])
df1
|
2015 |
2016 |
2017 |
2018 |
北京 |
1 |
2 |
3 |
4 |
上海 |
5 |
6 |
7 |
8 |
广州 |
9 |
10 |
11 |
12 |
深圳 |
13 |
14 |
15 |
16 |
df2 = df1.drop(['北京'])
df2
|
2015 |
2016 |
2017 |
2018 |
上海 |
5 |
6 |
7 |
8 |
广州 |
9 |
10 |
11 |
12 |
深圳 |
13 |
14 |
15 |
16 |
df1.drop(['北京', '上海'], axis = 0)
|
2015 |
2016 |
2017 |
2018 |
广州 |
9 |
10 |
11 |
12 |
深圳 |
13 |
14 |
15 |
16 |
df3 = df1.drop(['北京', '上海'], axis = 0)
df3
|
2015 |
2016 |
2017 |
2018 |
广州 |
9 |
10 |
11 |
12 |
深圳 |
13 |
14 |
15 |
16 |
(2)删除列
1)直接del DF[‘column-name’]
2)采用drop方法,有下面三种等价的表达式:
DF= DF.drop(‘column_name’, 1);
DF.drop(‘column_name’,axis=1, inplace=True)
DF.drop([DF.columns[[0,1,]]], axis=1,inplace=True)
del df1['2015']
df1
|
2016 |
2017 |
2018 |
北京 |
2 |
3 |
4 |
上海 |
6 |
7 |
8 |
广州 |
10 |
11 |
12 |
深圳 |
14 |
15 |
16 |
df1.drop(['2016'], axis = 1)
|
2017 |
2018 |
北京 |
3 |
4 |
上海 |
7 |
8 |
广州 |
11 |
12 |
深圳 |
15 |
16 |
df1
|
2016 |
2017 |
2018 |
北京 |
2 |
3 |
4 |
上海 |
6 |
7 |
8 |
广州 |
10 |
11 |
12 |
深圳 |
14 |
15 |
16 |