数据清洗
1.处理缺失数据
import pandas as pdlog_data = pd.read_csv('log.csv')
log_data
|
time
|
user
|
video
|
playback position
|
paused
|
volume
|
0
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
1
|
1469974454
|
cheryl
|
intro.html
|
6
|
NaN
|
NaN
|
2
|
1469974544
|
cheryl
|
intro.html
|
9
|
NaN
|
NaN
|
3
|
1469974574
|
cheryl
|
intro.html
|
10
|
NaN
|
NaN
|
4
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
5
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
6
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
7
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
8
|
1469974604
|
cheryl
|
intro.html
|
11
|
NaN
|
NaN
|
9
|
1469974694
|
cheryl
|
intro.html
|
14
|
NaN
|
NaN
|
10
|
1469974724
|
cheryl
|
intro.html
|
15
|
NaN
|
NaN
|
11
|
1469974454
|
sue
|
advanced.html
|
24
|
NaN
|
NaN
|
12
|
1469974524
|
sue
|
advanced.html
|
25
|
NaN
|
NaN
|
13
|
1469974424
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
14
|
1469974554
|
sue
|
advanced.html
|
26
|
NaN
|
NaN
|
15
|
1469974624
|
sue
|
advanced.html
|
27
|
NaN
|
NaN
|
16
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
17
|
1469974724
|
sue
|
advanced.html
|
29
|
NaN
|
NaN
|
18
|
1469974484
|
cheryl
|
intro.html
|
7
|
NaN
|
NaN
|
19
|
1469974514
|
cheryl
|
intro.html
|
8
|
NaN
|
NaN
|
20
|
1469974754
|
sue
|
advanced.html
|
30
|
NaN
|
NaN
|
21
|
1469974824
|
sue
|
advanced.html
|
31
|
NaN
|
NaN
|
22
|
1469974854
|
sue
|
advanced.html
|
32
|
NaN
|
NaN
|
23
|
1469974924
|
sue
|
advanced.html
|
33
|
NaN
|
NaN
|
24
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
25
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
26
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
27
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
28
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
29
|
1469974634
|
cheryl
|
intro.html
|
12
|
NaN
|
NaN
|
30
|
1469974664
|
cheryl
|
intro.html
|
13
|
NaN
|
NaN
|
31
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
32
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
log_data.isnull()
|
time
|
user
|
video
|
playback position
|
paused
|
volume
|
0
|
False
|
False
|
False
|
False
|
False
|
False
|
1
|
False
|
False
|
False
|
False
|
True
|
True
|
2
|
False
|
False
|
False
|
False
|
True
|
True
|
3
|
False
|
False
|
False
|
False
|
True
|
True
|
4
|
False
|
False
|
False
|
False
|
True
|
True
|
5
|
False
|
False
|
False
|
False
|
True
|
True
|
6
|
False
|
False
|
False
|
False
|
True
|
True
|
7
|
False
|
False
|
False
|
False
|
True
|
True
|
8
|
False
|
False
|
False
|
False
|
True
|
True
|
9
|
False
|
False
|
False
|
False
|
True
|
True
|
10
|
False
|
False
|
False
|
False
|
True
|
True
|
11
|
False
|
False
|
False
|
False
|
True
|
True
|
12
|
False
|
False
|
False
|
False
|
True
|
True
|
13
|
False
|
False
|
False
|
False
|
False
|
False
|
14
|
False
|
False
|
False
|
False
|
True
|
True
|
15
|
False
|
False
|
False
|
False
|
True
|
True
|
16
|
False
|
False
|
False
|
False
|
True
|
False
|
17
|
False
|
False
|
False
|
False
|
True
|
True
|
18
|
False
|
False
|
False
|
False
|
True
|
True
|
19
|
False
|
False
|
False
|
False
|
True
|
True
|
20
|
False
|
False
|
False
|
False
|
True
|
True
|
21
|
False
|
False
|
False
|
False
|
True
|
True
|
22
|
False
|
False
|
False
|
False
|
True
|
True
|
23
|
False
|
False
|
False
|
False
|
True
|
True
|
24
|
False
|
False
|
False
|
False
|
False
|
False
|
25
|
False
|
False
|
False
|
False
|
True
|
True
|
26
|
False
|
False
|
False
|
False
|
True
|
True
|
27
|
False
|
False
|
False
|
False
|
True
|
True
|
28
|
False
|
False
|
False
|
False
|
True
|
True
|
29
|
False
|
False
|
False
|
False
|
True
|
True
|
30
|
False
|
False
|
False
|
False
|
True
|
True
|
31
|
False
|
False
|
False
|
False
|
True
|
True
|
32
|
False
|
False
|
False
|
False
|
True
|
True
|
log_data['paused'].isnull()
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 False
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 False
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
Name: paused, dtype: bool
# 取出volume不为空的数据
log_data[log_data['volume'].notnull()]
|
time
|
user
|
video
|
playback position
|
paused
|
volume
|
0
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
13
|
1469974424
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
16
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
24
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
log_data.set_index(['time', 'user'], inplace=True)
log_data.sort_index(inplace=True)
log_data
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
NaN
|
NaN
|
sue
|
advanced.html
|
24
|
NaN
|
NaN
|
1469974484
|
cheryl
|
intro.html
|
7
|
NaN
|
NaN
|
1469974514
|
cheryl
|
intro.html
|
8
|
NaN
|
NaN
|
1469974524
|
sue
|
advanced.html
|
25
|
NaN
|
NaN
|
1469974544
|
cheryl
|
intro.html
|
9
|
NaN
|
NaN
|
1469974554
|
sue
|
advanced.html
|
26
|
NaN
|
NaN
|
1469974574
|
cheryl
|
intro.html
|
10
|
NaN
|
NaN
|
1469974604
|
cheryl
|
intro.html
|
11
|
NaN
|
NaN
|
1469974624
|
sue
|
advanced.html
|
27
|
NaN
|
NaN
|
1469974634
|
cheryl
|
intro.html
|
12
|
NaN
|
NaN
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
NaN
|
NaN
|
1469974694
|
cheryl
|
intro.html
|
14
|
NaN
|
NaN
|
1469974724
|
cheryl
|
intro.html
|
15
|
NaN
|
NaN
|
sue
|
advanced.html
|
29
|
NaN
|
NaN
|
1469974754
|
sue
|
advanced.html
|
30
|
NaN
|
NaN
|
1469974824
|
sue
|
advanced.html
|
31
|
NaN
|
NaN
|
1469974854
|
sue
|
advanced.html
|
32
|
NaN
|
NaN
|
1469974924
|
sue
|
advanced.html
|
33
|
NaN
|
NaN
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
log_data.fillna(0)
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
0
|
0.0
|
sue
|
advanced.html
|
24
|
0
|
0.0
|
1469974484
|
cheryl
|
intro.html
|
7
|
0
|
0.0
|
1469974514
|
cheryl
|
intro.html
|
8
|
0
|
0.0
|
1469974524
|
sue
|
advanced.html
|
25
|
0
|
0.0
|
1469974544
|
cheryl
|
intro.html
|
9
|
0
|
0.0
|
1469974554
|
sue
|
advanced.html
|
26
|
0
|
0.0
|
1469974574
|
cheryl
|
intro.html
|
10
|
0
|
0.0
|
1469974604
|
cheryl
|
intro.html
|
11
|
0
|
0.0
|
1469974624
|
sue
|
advanced.html
|
27
|
0
|
0.0
|
1469974634
|
cheryl
|
intro.html
|
12
|
0
|
0.0
|
1469974654
|
sue
|
advanced.html
|
28
|
0
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
0
|
0.0
|
1469974694
|
cheryl
|
intro.html
|
14
|
0
|
0.0
|
1469974724
|
cheryl
|
intro.html
|
15
|
0
|
0.0
|
sue
|
advanced.html
|
29
|
0
|
0.0
|
1469974754
|
sue
|
advanced.html
|
30
|
0
|
0.0
|
1469974824
|
sue
|
advanced.html
|
31
|
0
|
0.0
|
1469974854
|
sue
|
advanced.html
|
32
|
0
|
0.0
|
1469974924
|
sue
|
advanced.html
|
33
|
0
|
0.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977484
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977514
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977544
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977574
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977604
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977634
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977664
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977694
|
bob
|
intro.html
|
1
|
0
|
0.0
|
1469977724
|
bob
|
intro.html
|
1
|
0
|
0.0
|
log_data.dropna(subset=['volume'])
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
log_data
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
NaN
|
NaN
|
sue
|
advanced.html
|
24
|
NaN
|
NaN
|
1469974484
|
cheryl
|
intro.html
|
7
|
NaN
|
NaN
|
1469974514
|
cheryl
|
intro.html
|
8
|
NaN
|
NaN
|
1469974524
|
sue
|
advanced.html
|
25
|
NaN
|
NaN
|
1469974544
|
cheryl
|
intro.html
|
9
|
NaN
|
NaN
|
1469974554
|
sue
|
advanced.html
|
26
|
NaN
|
NaN
|
1469974574
|
cheryl
|
intro.html
|
10
|
NaN
|
NaN
|
1469974604
|
cheryl
|
intro.html
|
11
|
NaN
|
NaN
|
1469974624
|
sue
|
advanced.html
|
27
|
NaN
|
NaN
|
1469974634
|
cheryl
|
intro.html
|
12
|
NaN
|
NaN
|
1469974654
|
sue
|
advanced.html
|
28
|
NaN
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
NaN
|
NaN
|
1469974694
|
cheryl
|
intro.html
|
14
|
NaN
|
NaN
|
1469974724
|
cheryl
|
intro.html
|
15
|
NaN
|
NaN
|
sue
|
advanced.html
|
29
|
NaN
|
NaN
|
1469974754
|
sue
|
advanced.html
|
30
|
NaN
|
NaN
|
1469974824
|
sue
|
advanced.html
|
31
|
NaN
|
NaN
|
1469974854
|
sue
|
advanced.html
|
32
|
NaN
|
NaN
|
1469974924
|
sue
|
advanced.html
|
33
|
NaN
|
NaN
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
log_data.ffill()
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
False
|
10.0
|
sue
|
advanced.html
|
24
|
False
|
10.0
|
1469974484
|
cheryl
|
intro.html
|
7
|
False
|
10.0
|
1469974514
|
cheryl
|
intro.html
|
8
|
False
|
10.0
|
1469974524
|
sue
|
advanced.html
|
25
|
False
|
10.0
|
1469974544
|
cheryl
|
intro.html
|
9
|
False
|
10.0
|
1469974554
|
sue
|
advanced.html
|
26
|
False
|
10.0
|
1469974574
|
cheryl
|
intro.html
|
10
|
False
|
10.0
|
1469974604
|
cheryl
|
intro.html
|
11
|
False
|
10.0
|
1469974624
|
sue
|
advanced.html
|
27
|
False
|
10.0
|
1469974634
|
cheryl
|
intro.html
|
12
|
False
|
10.0
|
1469974654
|
sue
|
advanced.html
|
28
|
False
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
False
|
5.0
|
1469974694
|
cheryl
|
intro.html
|
14
|
False
|
5.0
|
1469974724
|
cheryl
|
intro.html
|
15
|
False
|
5.0
|
sue
|
advanced.html
|
29
|
False
|
5.0
|
1469974754
|
sue
|
advanced.html
|
30
|
False
|
5.0
|
1469974824
|
sue
|
advanced.html
|
31
|
False
|
5.0
|
1469974854
|
sue
|
advanced.html
|
32
|
False
|
5.0
|
1469974924
|
sue
|
advanced.html
|
33
|
False
|
5.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977484
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977514
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977544
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977574
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977604
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977634
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977664
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977694
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977724
|
bob
|
intro.html
|
1
|
True
|
10.0
|
log_data.bfill()
|
|
video
|
playback position
|
paused
|
volume
|
time
|
user
|
|
|
|
|
1469974424
|
cheryl
|
intro.html
|
5
|
False
|
10.0
|
sue
|
advanced.html
|
23
|
False
|
10.0
|
1469974454
|
cheryl
|
intro.html
|
6
|
True
|
5.0
|
sue
|
advanced.html
|
24
|
True
|
5.0
|
1469974484
|
cheryl
|
intro.html
|
7
|
True
|
5.0
|
1469974514
|
cheryl
|
intro.html
|
8
|
True
|
5.0
|
1469974524
|
sue
|
advanced.html
|
25
|
True
|
5.0
|
1469974544
|
cheryl
|
intro.html
|
9
|
True
|
5.0
|
1469974554
|
sue
|
advanced.html
|
26
|
True
|
5.0
|
1469974574
|
cheryl
|
intro.html
|
10
|
True
|
5.0
|
1469974604
|
cheryl
|
intro.html
|
11
|
True
|
5.0
|
1469974624
|
sue
|
advanced.html
|
27
|
True
|
5.0
|
1469974634
|
cheryl
|
intro.html
|
12
|
True
|
5.0
|
1469974654
|
sue
|
advanced.html
|
28
|
True
|
5.0
|
1469974664
|
cheryl
|
intro.html
|
13
|
True
|
10.0
|
1469974694
|
cheryl
|
intro.html
|
14
|
True
|
10.0
|
1469974724
|
cheryl
|
intro.html
|
15
|
True
|
10.0
|
sue
|
advanced.html
|
29
|
True
|
10.0
|
1469974754
|
sue
|
advanced.html
|
30
|
True
|
10.0
|
1469974824
|
sue
|
advanced.html
|
31
|
True
|
10.0
|
1469974854
|
sue
|
advanced.html
|
32
|
True
|
10.0
|
1469974924
|
sue
|
advanced.html
|
33
|
True
|
10.0
|
1469977424
|
bob
|
intro.html
|
1
|
True
|
10.0
|
1469977454
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977484
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977514
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977544
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977574
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977604
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977634
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977664
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977694
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
1469977724
|
bob
|
intro.html
|
1
|
NaN
|
NaN
|
2.数据变形
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})
data
|
k1
|
k2
|
0
|
one
|
1
|
1
|
two
|
1
|
2
|
one
|
2
|
3
|
two
|
3
|
4
|
one
|
3
|
5
|
two
|
4
|
6
|
two
|
4
|
# 判断数据是否重复
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
# 去除重复数据
data.drop_duplicates()
|
k1
|
k2
|
0
|
one
|
1
|
1
|
two
|
1
|
2
|
one
|
2
|
3
|
two
|
3
|
4
|
one
|
3
|
5
|
two
|
4
|
data['v1'] = range(7)
data
|
k1
|
k2
|
v1
|
0
|
one
|
1
|
0
|
1
|
two
|
1
|
1
|
2
|
one
|
2
|
2
|
3
|
two
|
3
|
3
|
4
|
one
|
3
|
4
|
5
|
two
|
4
|
5
|
6
|
two
|
4
|
6
|
# 去除指定列的重复数据
data.drop_duplicates(['k1'])
|
k1
|
k2
|
v1
|
0
|
one
|
1
|
0
|
1
|
two
|
1
|
1
|
data.drop_duplicates(['k1', 'k2'], keep='last')
|
k1
|
k2
|
v1
|
0
|
one
|
1
|
0
|
1
|
two
|
1
|
1
|
2
|
one
|
2
|
2
|
3
|
two
|
3
|
3
|
4
|
one
|
3
|
4
|
6
|
two
|
4
|
6
|
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
|
food
|
ounces
|
0
|
bacon
|
4.0
|
1
|
pulled pork
|
3.0
|
2
|
bacon
|
12.0
|
3
|
Pastrami
|
6.0
|
4
|
corned beef
|
7.5
|
5
|
Bacon
|
8.0
|
6
|
pastrami
|
3.0
|
7
|
honey ham
|
5.0
|
8
|
nova lox
|
6.0
|
# 添加一列,用于指定食物的来源
meat_to_animal = {'bacon': 'pig','pulled pork': 'pig','pastrami': 'cow','corned beef': 'cow','honey ham': 'pig','nova lox': 'salmon'
}
# 使用map()
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data
|
food
|
ounces
|
animal
|
0
|
bacon
|
4.0
|
pig
|
1
|
pulled pork
|
3.0
|
pig
|
2
|
bacon
|
12.0
|
pig
|
3
|
Pastrami
|
6.0
|
cow
|
4
|
corned beef
|
7.5
|
cow
|
5
|
Bacon
|
8.0
|
pig
|
6
|
pastrami
|
3.0
|
cow
|
7
|
honey ham
|
5.0
|
pig
|
8
|
nova lox
|
6.0
|
salmon
|
# 使用方法
data['animal2'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data
|
food
|
ounces
|
animal
|
animal2
|
0
|
bacon
|
4.0
|
pig
|
pig
|
1
|
pulled pork
|
3.0
|
pig
|
pig
|
2
|
bacon
|
12.0
|
pig
|
pig
|
3
|
Pastrami
|
6.0
|
cow
|
cow
|
4
|
corned beef
|
7.5
|
cow
|
cow
|
5
|
Bacon
|
8.0
|
pig
|
pig
|
6
|
pastrami
|
3.0
|
cow
|
cow
|
7
|
honey ham
|
5.0
|
pig
|
pig
|
8
|
nova lox
|
6.0
|
salmon
|
salmon
|
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
import numpy as np# 将-999替换为空值
data.replace(-999, np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
# 将-999,-1000都替换为空值
data.replace([-999, -1000], np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
# 将-999,-1000分别替换为空值和0
data.replace([-999, -1000], [np.nan, 0])
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
import pandas as pd
# 年龄数据
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]# 分箱的边界
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(type(cats))
<class 'pandas.core.arrays.categorical.Categorical'>
# Categorical对象
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# 获取分箱编码
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# 返回分箱边界索引
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]closed='right',dtype='interval[int64]')
# 统计箱中元素的个数
pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# 带标签的分箱
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
cats.get_values()
array(['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', 'Youth','MiddleAged', 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged','YoungAdult'], dtype=object)
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df
|
key
|
data1
|
0
|
b
|
0
|
1
|
b
|
1
|
2
|
a
|
2
|
3
|
c
|
3
|
4
|
a
|
4
|
5
|
b
|
5
|
pd.get_dummies(df['key'])
|
a
|
b
|
c
|
0
|
0
|
1
|
0
|
1
|
0
|
1
|
0
|
2
|
1
|
0
|
0
|
3
|
0
|
0
|
1
|
4
|
1
|
0
|
0
|
5
|
0
|
1
|
0
|
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
dtype: object
data.str.contains('gmail')
Dave False
Steve True
Rob True
Wes NaN
dtype: object
data.str[:5]
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object
split_df = data.str.split('@', expand=True)
split_df
|
0
|
1
|
Dave
|
dave
|
google.com
|
Steve
|
steve
|
gmail.com
|
Rob
|
rob
|
gmail.com
|
Wes
|
NaN
|
NaN
|
split_df[0].str.cat(split_df[1], sep='@')
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
Name: 0, dtype: object
lect02_codes02_pandas相关推荐
最新文章
- 1、Cocos2dx 3.0游戏开发三找一小块前言
- 语音识别基础,总有一天你会用到
- UML建模—EA创建Class(类图)
- Build path contains duplicate entry
- 创建用户的种类与区分
- spring boot 注解
- 招聘贴---这个很重要嘛
- Identify ksusetxn DID:An Deadlock ID
- Print Conductor中文版
- Micro USB 引脚定义及OTG (USB-HOST) 接线
- 图解医学影像纹理特征
- 计算机网络双绞线和交叉线的区别,交叉线-什么是交叉线、直通线和双绞线?各有什么区别 – 手机爱问...
- Mac M1安装ta-lib
- Windows如何安装WSL(中途退出后遇到bug该怎么办)
- Netty实战:Springboot+Netty+protobuf开发高性能服务器 (附源码下载)
- 让女人无法抗拒的30句表白
- 数据结构算法Day01
- 主题 02:如何设计系统预案(Preplan)?
- 西门子PCS7系统NTP网络时钟同步(NTP时钟发生器)介绍
- ArcGIS 切片缓存紧凑文件格式分析与使用
热门文章
- 【PTA】【C语言(一)】
- 计算机组成原理课程设计(很全面有保障)
- f7功能键使用计算机,【ThinkPad】F1-F12功能键最全使用指南
- Hutool工具类之excel导入导出
- 计算机新建表格2,(Word表格的制作计算机基础2.doc
- 离散控制 discrete control
- 42-Map集合遍历键找值方式-键值对对象Entry-键值对方式遍历
- print list Reversely
- 小程序setData执行后,页面没有刷新
- sftp命令上传本地文件到服务器,使用sftp命令上传文件夹方法