import pandas as pd
import numpy as np
pd.__version__
'1.0.3'
文件的读写
csv格式
# Read the sample table from CSV. The path is a raw string so the Windows
# backslashes stay literal instead of relying on unrecognised escape
# sequences such as "\j" and "\d", which newer Python versions warn about.
data=pd.read_csv(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\table.csv')
data.head(20)
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Math
|
Physics
|
0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
34.0
|
A+
|
1
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
32.5
|
B+
|
2
|
S_1
|
C_1
|
1103
|
M
|
street_2
|
186
|
82
|
87.2
|
B+
|
3
|
S_1
|
C_1
|
1104
|
F
|
street_2
|
167
|
81
|
80.4
|
B-
|
4
|
S_1
|
C_1
|
1105
|
F
|
street_4
|
159
|
64
|
84.8
|
B+
|
5
|
S_1
|
C_2
|
1201
|
M
|
street_5
|
188
|
68
|
97.0
|
A-
|
6
|
S_1
|
C_2
|
1202
|
F
|
street_4
|
176
|
94
|
63.5
|
B-
|
7
|
S_1
|
C_2
|
1203
|
M
|
street_6
|
160
|
53
|
58.8
|
A+
|
8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
33.8
|
B
|
9
|
S_1
|
C_2
|
1205
|
F
|
street_6
|
167
|
63
|
68.4
|
B-
|
10
|
S_1
|
C_3
|
1301
|
M
|
street_4
|
161
|
68
|
31.5
|
B+
|
11
|
S_1
|
C_3
|
1302
|
F
|
street_1
|
175
|
57
|
87.7
|
A-
|
12
|
S_1
|
C_3
|
1303
|
M
|
street_7
|
188
|
82
|
49.7
|
B
|
13
|
S_1
|
C_3
|
1304
|
M
|
street_2
|
195
|
70
|
85.2
|
A
|
14
|
S_1
|
C_3
|
1305
|
F
|
street_5
|
187
|
69
|
61.7
|
B-
|
15
|
S_2
|
C_1
|
2101
|
M
|
street_7
|
174
|
84
|
83.3
|
C
|
16
|
S_2
|
C_1
|
2102
|
F
|
street_6
|
161
|
61
|
50.6
|
B+
|
17
|
S_2
|
C_1
|
2103
|
M
|
street_4
|
157
|
61
|
52.5
|
B-
|
18
|
S_2
|
C_1
|
2104
|
F
|
street_5
|
159
|
97
|
72.2
|
B+
|
19
|
S_2
|
C_1
|
2105
|
M
|
street_4
|
170
|
81
|
34.2
|
A
|
txt格式
# Read a delimited text file. read_table splits on tab ('\t') by default
# (read_csv is the one that defaults to ','); pass sep=... for any other
# delimiter. The path is a raw string so the backslashes stay literal.
data_txt=pd.read_table(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\table.txt')
data_txt.head(20)
|
col1
|
col2
|
col3
|
col4
|
0
|
2
|
a
|
1.4
|
apple
|
1
|
3
|
b
|
3.4
|
banana
|
2
|
6
|
c
|
2.5
|
orange
|
3
|
5
|
d
|
3.2
|
lemon
|
xls或者xlsx格式
# Read the first sheet of the Excel workbook. The path is a raw string so the
# Windows backslashes stay literal instead of depending on unrecognised
# escape sequences.
data_excel=pd.read_excel(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\table.xlsx')
data_excel.head(20)
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Math
|
Physics
|
0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
34.0
|
A+
|
1
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
32.5
|
B+
|
2
|
S_1
|
C_1
|
1103
|
M
|
street_2
|
186
|
82
|
87.2
|
B+
|
3
|
S_1
|
C_1
|
1104
|
F
|
street_2
|
167
|
81
|
80.4
|
B-
|
4
|
S_1
|
C_1
|
1105
|
F
|
street_4
|
159
|
64
|
84.8
|
B+
|
5
|
S_1
|
C_2
|
1201
|
M
|
street_5
|
188
|
68
|
97.0
|
A-
|
6
|
S_1
|
C_2
|
1202
|
F
|
street_4
|
176
|
94
|
63.5
|
B-
|
7
|
S_1
|
C_2
|
1203
|
M
|
street_6
|
160
|
53
|
58.8
|
A+
|
8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
33.8
|
B
|
9
|
S_1
|
C_2
|
1205
|
F
|
street_6
|
167
|
63
|
68.4
|
B-
|
10
|
S_1
|
C_3
|
1301
|
M
|
street_4
|
161
|
68
|
31.5
|
B+
|
11
|
S_1
|
C_3
|
1302
|
F
|
street_1
|
175
|
57
|
87.7
|
A-
|
12
|
S_1
|
C_3
|
1303
|
M
|
street_7
|
188
|
82
|
49.7
|
B
|
13
|
S_1
|
C_3
|
1304
|
M
|
street_2
|
195
|
70
|
85.2
|
A
|
14
|
S_1
|
C_3
|
1305
|
F
|
street_5
|
187
|
69
|
61.7
|
B-
|
15
|
S_2
|
C_1
|
2101
|
M
|
street_7
|
174
|
84
|
83.3
|
C
|
16
|
S_2
|
C_1
|
2102
|
F
|
street_6
|
161
|
61
|
50.6
|
B+
|
17
|
S_2
|
C_1
|
2103
|
M
|
street_4
|
157
|
61
|
52.5
|
B-
|
18
|
S_2
|
C_1
|
2104
|
F
|
street_5
|
159
|
97
|
72.2
|
B+
|
19
|
S_2
|
C_1
|
2105
|
M
|
street_4
|
170
|
81
|
34.2
|
A
|
写入
csv格式
# Write the frame back to CSV (raw-string path keeps the backslashes literal).
# NOTE: this overwrites the same file that was read at the top of the notes,
# and with the default index=True the row index is written as an extra
# unnamed first column, which appears as 'Unnamed: 0' when the file is read
# back. Pass index=False if that column is not wanted.
data.to_csv(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\table.csv')
其他的不过多展示了
数据的基本结构
Series
创建一个Series
- 对于一个Series,其中最常用的属性为值(values),索引(index),名字(name),类型(dtype)
# Five standard-normal draws labelled 'a'..'e'. The name and dtype are given
# explicitly so that all four main Series attributes (values, index, name,
# dtype) are visible in the output.
s=pd.Series(
    data=np.random.randn(5),
    index=list('abcde'),
    name='this is Series',
    dtype='float64',
)
s
a -1.576202
b 1.070319
c 0.668490
d 1.760701
e 0.942655
Name: this is Series, dtype: float64
访问Series属性
s.values
array([-1.57620183, 1.07031892, 0.66849049, 1.76070095, 0.94265474])
其余属性(index、name、dtype)的访问方式相同,这里就不逐一演示了
取出元素
#根据索引来取出
s['a']
-1.5762018254215895
调用方法
# Print every public name the Series exposes, i.e. everything dir() reports
# that does not start with an underscore: attributes and methods (the string
# index labels 'a'..'e' show up in the printed list as well).
print([attr for attr in dir(s) if not attr.startswith('_')])
['T', 'a', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'b', 'between', 'between_time', 'bfill', 'bool', 'c', 'clip', 'combine', 'combine_first', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'd', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'e', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'iteritems', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'shape', 'shift', 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', 'squeeze', 'std', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 
'truediv', 'truncate', 'tshift', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs']
DataFrame
创建一个DF
# Build a 5-row frame from a dict of columns (one string column, one integer
# range, one float list) and label the rows with the strings '1'..'5'.
df=pd.DataFrame(
    data={
        'col1': list('abcde'),
        'col2': range(5, 10),
        'col3': [1.3, 2.5, 3.6, 4.6, 5.8],
    },
    index=list('12345'),
)
df
|
col1
|
col2
|
col3
|
1
|
a
|
5
|
1.3
|
2
|
b
|
6
|
2.5
|
3
|
c
|
7
|
3.6
|
4
|
d
|
8
|
4.6
|
5
|
e
|
9
|
5.8
|
从DF中取出一列为Series
df['col1']
1 a
2 b
3 c
4 d
5 e
Name: col1, dtype: object
type(df)
pandas.core.frame.DataFrame
type(df['col1'])
pandas.core.series.Series
索引对齐
# Arithmetic between two frames is aligned on index labels, not on row
# position. Both frames hold the values 1, 2, 3, but under different labels,
# so the difference is not all zeros.
df1=pd.DataFrame(data={'A':[1,2,3]},index=[1,2,3])
df2=pd.DataFrame(data={'A':[1,2,3]},index=[3,1,2])
df1-df2
列的删除与添加
df.drop(index='5',columns='col1')
|
col2
|
col3
|
1
|
5
|
1.3
|
2
|
6
|
2.5
|
3
|
7
|
3.6
|
4
|
8
|
4.6
|
# pop() removes the column from the DataFrame in place and returns it as a
# Series, similar to pop on Python's built-in containers. col1 is first
# overwritten with integers, which is what the returned Series contains.
df['col1']=[1,2,3,4,5]
df.pop('col1')
1 1
2 2
3 3
4 4
5 5
Name: col1, dtype: int64
df
|
col2
|
col3
|
1
|
5
|
1.3
|
2
|
6
|
2.5
|
3
|
7
|
3.6
|
4
|
8
|
4.6
|
5
|
9
|
5.8
|
# A column can be added by direct assignment, which modifies df1 in place.
df1['B']=list('abc')
df1
# assign() returns a new DataFrame and leaves df1 untouched, so the result
# has to be bound to a name if it is to be kept.
# NOTE: the Series passed here has the default index 0, 1, 2 while df1 is
# indexed 1, 2, 3. Values are matched on index labels, so column C comes out
# as 'e', 'f', NaN rather than 'd', 'e', 'f'.
df1.assign(C=pd.Series(list('def')))
|
A
|
B
|
C
|
1
|
1
|
a
|
e
|
2
|
2
|
b
|
f
|
3
|
3
|
c
|
NaN
|
根据不同类型选择列
df.select_dtypes(include=['number']).head()
|
col2
|
col3
|
1
|
5
|
1.3
|
2
|
6
|
2.5
|
3
|
7
|
3.6
|
4
|
8
|
4.6
|
5
|
9
|
5.8
|
将series转换成DF
s=df.mean()
s.name='to_DF'
s
col2 7.00
col3 3.56
Name: to_DF, dtype: float64
s.to_frame()
|
to_DF
|
col2
|
7.00
|
col3
|
3.56
|
#使用T进行转置
s.to_frame().T
常用的基本函数
# Reload the table for the function demos below (raw-string path keeps the
# Windows backslashes literal).
df=pd.read_csv(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\table.csv')
# The file may contain a leftover index column ('Unnamed: 0') from an earlier
# to_csv call that wrote the index. errors='ignore' keeps this cell working
# whether or not that column is present, instead of raising KeyError.
df=df.drop(columns='Unnamed: 0',errors='ignore')
df
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Math
|
Physics
|
0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
34.0
|
A+
|
1
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
32.5
|
B+
|
2
|
S_1
|
C_1
|
1103
|
M
|
street_2
|
186
|
82
|
87.2
|
B+
|
3
|
S_1
|
C_1
|
1104
|
F
|
street_2
|
167
|
81
|
80.4
|
B-
|
4
|
S_1
|
C_1
|
1105
|
F
|
street_4
|
159
|
64
|
84.8
|
B+
|
5
|
S_1
|
C_2
|
1201
|
M
|
street_5
|
188
|
68
|
97.0
|
A-
|
6
|
S_1
|
C_2
|
1202
|
F
|
street_4
|
176
|
94
|
63.5
|
B-
|
7
|
S_1
|
C_2
|
1203
|
M
|
street_6
|
160
|
53
|
58.8
|
A+
|
8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
33.8
|
B
|
9
|
S_1
|
C_2
|
1205
|
F
|
street_6
|
167
|
63
|
68.4
|
B-
|
10
|
S_1
|
C_3
|
1301
|
M
|
street_4
|
161
|
68
|
31.5
|
B+
|
11
|
S_1
|
C_3
|
1302
|
F
|
street_1
|
175
|
57
|
87.7
|
A-
|
12
|
S_1
|
C_3
|
1303
|
M
|
street_7
|
188
|
82
|
49.7
|
B
|
13
|
S_1
|
C_3
|
1304
|
M
|
street_2
|
195
|
70
|
85.2
|
A
|
14
|
S_1
|
C_3
|
1305
|
F
|
street_5
|
187
|
69
|
61.7
|
B-
|
15
|
S_2
|
C_1
|
2101
|
M
|
street_7
|
174
|
84
|
83.3
|
C
|
16
|
S_2
|
C_1
|
2102
|
F
|
street_6
|
161
|
61
|
50.6
|
B+
|
17
|
S_2
|
C_1
|
2103
|
M
|
street_4
|
157
|
61
|
52.5
|
B-
|
18
|
S_2
|
C_1
|
2104
|
F
|
street_5
|
159
|
97
|
72.2
|
B+
|
19
|
S_2
|
C_1
|
2105
|
M
|
street_4
|
170
|
81
|
34.2
|
A
|
20
|
S_2
|
C_2
|
2201
|
M
|
street_5
|
193
|
100
|
39.1
|
B
|
21
|
S_2
|
C_2
|
2202
|
F
|
street_7
|
194
|
77
|
68.5
|
B+
|
22
|
S_2
|
C_2
|
2203
|
M
|
street_4
|
155
|
91
|
73.8
|
A+
|
23
|
S_2
|
C_2
|
2204
|
M
|
street_1
|
175
|
74
|
47.2
|
B-
|
24
|
S_2
|
C_2
|
2205
|
F
|
street_7
|
183
|
76
|
85.4
|
B
|
25
|
S_2
|
C_3
|
2301
|
F
|
street_4
|
157
|
78
|
72.3
|
B+
|
26
|
S_2
|
C_3
|
2302
|
M
|
street_5
|
171
|
88
|
32.7
|
A
|
27
|
S_2
|
C_3
|
2303
|
F
|
street_7
|
190
|
99
|
65.9
|
C
|
28
|
S_2
|
C_3
|
2304
|
F
|
street_6
|
164
|
81
|
95.5
|
A-
|
29
|
S_2
|
C_3
|
2305
|
M
|
street_4
|
187
|
73
|
48.9
|
B
|
30
|
S_2
|
C_4
|
2401
|
F
|
street_2
|
192
|
62
|
45.3
|
A
|
31
|
S_2
|
C_4
|
2402
|
M
|
street_7
|
166
|
82
|
48.7
|
B
|
32
|
S_2
|
C_4
|
2403
|
F
|
street_6
|
158
|
60
|
59.7
|
B+
|
33
|
S_2
|
C_4
|
2404
|
F
|
street_2
|
160
|
84
|
67.7
|
B
|
34
|
S_2
|
C_4
|
2405
|
F
|
street_6
|
193
|
54
|
47.6
|
B
|
unique和nunique
nunique显示有多少个唯一值 —数量
df['Physics'].nunique()
7
unique显示所有的唯一值 —具体值
df['Physics'].unique()
array(['A+', 'B+', 'B-', 'A-', 'B', 'A', 'C'], dtype=object)
count and value_counts
# count() returns the number of non-missing elements, not the total length.
df['Physics'].count()
35
# value_counts() returns how many times each distinct value occurs, most
# frequent first; missing values are left out unless dropna=False is passed.
df['Physics'].value_counts()
B+ 9
B 8
B- 6
A 4
A+ 3
A- 3
C 2
Name: Physics, dtype: int64
describe and info
# info() reports which columns exist, how many non-null values each has, and
# each column's dtype.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   School   35 non-null     object
 1   Class    35 non-null     object
 2   ID       35 non-null     int64
 3   Gender   35 non-null     object
 4   Address  35 non-null     object
 5   Height   35 non-null     int64
 6   Weight   35 non-null     int64
 7   Math     35 non-null     float64
 8   Physics  35 non-null     object
dtypes: float64(1), int64(3), object(5)
memory usage: 2.6+ KB
#describe默认统计数值型数据的各个统计量
df.describe()
|
ID
|
Height
|
Weight
|
Math
|
count
|
35.00000
|
35.000000
|
35.000000
|
35.000000
|
mean
|
1803.00000
|
174.142857
|
74.657143
|
61.351429
|
std
|
536.87741
|
13.541098
|
12.895377
|
19.915164
|
min
|
1101.00000
|
155.000000
|
53.000000
|
31.500000
|
25%
|
1204.50000
|
161.000000
|
63.000000
|
47.400000
|
50%
|
2103.00000
|
173.000000
|
74.000000
|
61.700000
|
75%
|
2301.50000
|
187.500000
|
82.000000
|
77.100000
|
max
|
2405.00000
|
195.000000
|
100.000000
|
97.000000
|
# describe() accepts custom percentiles; they must be passed as a list. The
# 50% row (median) is still included in the output even though it is not
# listed here.
df.describe(percentiles=[.05,.25,.75,.95])
|
ID
|
Height
|
Weight
|
Math
|
count
|
35.00000
|
35.000000
|
35.000000
|
35.000000
|
mean
|
1803.00000
|
174.142857
|
74.657143
|
61.351429
|
std
|
536.87741
|
13.541098
|
12.895377
|
19.915164
|
min
|
1101.00000
|
155.000000
|
53.000000
|
31.500000
|
5%
|
1102.70000
|
157.000000
|
56.100000
|
32.640000
|
25%
|
1204.50000
|
161.000000
|
63.000000
|
47.400000
|
50%
|
2103.00000
|
173.000000
|
74.000000
|
61.700000
|
75%
|
2301.50000
|
187.500000
|
82.000000
|
77.100000
|
95%
|
2403.30000
|
193.300000
|
97.600000
|
90.040000
|
max
|
2405.00000
|
195.000000
|
100.000000
|
97.000000
|
#对于非数值型也可以使用describe函数
df['Physics'].describe()
count 35
unique 7
top B+
freq 9
Name: Physics, dtype: object
idxmax and nlargest
# idxmax() returns the index label of the largest value (not the value
# itself); idxmin() does the same for the smallest value. Only the last
# expression of a notebook cell is displayed, so the output shown is the
# result of idxmin().
df['Math'].idxmax()
df['Math'].idxmin()
10
# nlargest() returns the n largest values together with their index labels;
# n is optional (five rows are returned here when it is omitted).
# nsmallest() works the same way for the smallest values.
df['Math'].nlargest()
5 97.0
28 95.5
11 87.7
2 87.2
24 85.4
Name: Math, dtype: float64
clip and replace
# clip(lower, upper) truncates values to a range: anything below 33 becomes
# 33 and anything above 80 becomes 80; values in between are unchanged.
df['Math'].clip(33,80).head(30)
0 34.0
1 33.0
2 80.0
3 80.0
4 80.0
5 80.0
6 63.5
7 58.8
8 33.8
9 68.4
10 33.0
11 80.0
12 49.7
13 80.0
14 61.7
15 80.0
16 50.6
17 52.5
18 72.2
19 34.2
20 39.1
21 68.5
22 73.8
23 47.2
24 80.0
25 72.3
26 33.0
27 65.9
28 80.0
29 48.9
Name: Math, dtype: float64
# replace() substitutes selected values with others. First show the original
# Address values for comparison.
df['Address'].head()
0 street_1
1 street_2
2 street_2
3 street_2
4 street_4
Name: Address, dtype: object
# Map 'street_1' -> 'one' and 'street_2' -> 'two'; every other value is left
# unchanged. The method is spelled replace(); the misspelling "repalce"
# raises AttributeError because Series has no such attribute.
df['Address'].replace(['street_1','street_2'],['one','two']).head()
# Expected result for the five Address values displayed just before:
# 0         one
# 1         two
# 2         two
# 3         two
# 4    street_4
# Name: Address, dtype: object
apply函数
# On a Series, apply() calls the function once per element; here every value
# is converted to a string and suffixed with '!'.
df['Math'].apply(lambda x:str(x)+'!').head()
0 34.0!
1 32.5!
2 87.2!
3 80.4!
4 84.8!
Name: Math, dtype: object
# On a DataFrame, apply() hands each column (a Series) to the function; the
# inner apply() then visits every element of that column and appends '!' to
# its string form.
df.apply(lambda col:col.apply(lambda value:str(value)+'!')).head()
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Math
|
Physics
|
0
|
S_1!
|
C_1!
|
1101!
|
M!
|
street_1!
|
173!
|
63!
|
34.0!
|
A+!
|
1
|
S_1!
|
C_1!
|
1102!
|
F!
|
street_2!
|
192!
|
73!
|
32.5!
|
B+!
|
2
|
S_1!
|
C_1!
|
1103!
|
M!
|
street_2!
|
186!
|
82!
|
87.2!
|
B+!
|
3
|
S_1!
|
C_1!
|
1104!
|
F!
|
street_2!
|
167!
|
81!
|
80.4!
|
B-!
|
4
|
S_1!
|
C_1!
|
1105!
|
F!
|
street_4!
|
159!
|
64!
|
84.8!
|
B+!
|
# apply() can also be limited to a single selected column; this is the same
# element-wise transformation as above, shown for the whole Math column.
df['Math'].apply(lambda x:str(x)+"!")
0 34.0!
1 32.5!
2 87.2!
3 80.4!
4 84.8!
5 97.0!
6 63.5!
7 58.8!
8 33.8!
9 68.4!
10 31.5!
11 87.7!
12 49.7!
13 85.2!
14 61.7!
15 83.3!
16 50.6!
17 52.5!
18 72.2!
19 34.2!
20 39.1!
21 68.5!
22 73.8!
23 47.2!
24 85.4!
25 72.3!
26 32.7!
27 65.9!
28 95.5!
29 48.9!
30 45.3!
31 48.7!
32 59.7!
33 67.7!
34 47.6!
Name: Math, dtype: object
排序
索引排序
df.set_index('Math').head(20)
#将Math设为索引
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Physics
|
Math
|
|
|
|
|
|
|
|
|
34.0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
A+
|
32.5
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
B+
|
87.2
|
S_1
|
C_1
|
1103
|
M
|
street_2
|
186
|
82
|
B+
|
80.4
|
S_1
|
C_1
|
1104
|
F
|
street_2
|
167
|
81
|
B-
|
84.8
|
S_1
|
C_1
|
1105
|
F
|
street_4
|
159
|
64
|
B+
|
97.0
|
S_1
|
C_2
|
1201
|
M
|
street_5
|
188
|
68
|
A-
|
63.5
|
S_1
|
C_2
|
1202
|
F
|
street_4
|
176
|
94
|
B-
|
58.8
|
S_1
|
C_2
|
1203
|
M
|
street_6
|
160
|
53
|
A+
|
33.8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
B
|
68.4
|
S_1
|
C_2
|
1205
|
F
|
street_6
|
167
|
63
|
B-
|
31.5
|
S_1
|
C_3
|
1301
|
M
|
street_4
|
161
|
68
|
B+
|
87.7
|
S_1
|
C_3
|
1302
|
F
|
street_1
|
175
|
57
|
A-
|
49.7
|
S_1
|
C_3
|
1303
|
M
|
street_7
|
188
|
82
|
B
|
85.2
|
S_1
|
C_3
|
1304
|
M
|
street_2
|
195
|
70
|
A
|
61.7
|
S_1
|
C_3
|
1305
|
F
|
street_5
|
187
|
69
|
B-
|
83.3
|
S_2
|
C_1
|
2101
|
M
|
street_7
|
174
|
84
|
C
|
50.6
|
S_2
|
C_1
|
2102
|
F
|
street_6
|
161
|
61
|
B+
|
52.5
|
S_2
|
C_1
|
2103
|
M
|
street_4
|
157
|
61
|
B-
|
72.2
|
S_2
|
C_1
|
2104
|
F
|
street_5
|
159
|
97
|
B+
|
34.2
|
S_2
|
C_1
|
2105
|
M
|
street_4
|
170
|
81
|
A
|
df.set_index('Math').sort_index().head()
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Physics
|
Math
|
|
|
|
|
|
|
|
|
31.5
|
S_1
|
C_3
|
1301
|
M
|
street_4
|
161
|
68
|
B+
|
32.5
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
B+
|
32.7
|
S_2
|
C_3
|
2302
|
M
|
street_5
|
171
|
88
|
A
|
33.8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
B
|
34.0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
A+
|
值排序
df.sort_values(by='Class').head(20)
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Math
|
Physics
|
0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
34.0
|
A+
|
19
|
S_2
|
C_1
|
2105
|
M
|
street_4
|
170
|
81
|
34.2
|
A
|
18
|
S_2
|
C_1
|
2104
|
F
|
street_5
|
159
|
97
|
72.2
|
B+
|
16
|
S_2
|
C_1
|
2102
|
F
|
street_6
|
161
|
61
|
50.6
|
B+
|
15
|
S_2
|
C_1
|
2101
|
M
|
street_7
|
174
|
84
|
83.3
|
C
|
17
|
S_2
|
C_1
|
2103
|
M
|
street_4
|
157
|
61
|
52.5
|
B-
|
1
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
32.5
|
B+
|
2
|
S_1
|
C_1
|
1103
|
M
|
street_2
|
186
|
82
|
87.2
|
B+
|
3
|
S_1
|
C_1
|
1104
|
F
|
street_2
|
167
|
81
|
80.4
|
B-
|
4
|
S_1
|
C_1
|
1105
|
F
|
street_4
|
159
|
64
|
84.8
|
B+
|
6
|
S_1
|
C_2
|
1202
|
F
|
street_4
|
176
|
94
|
63.5
|
B-
|
24
|
S_2
|
C_2
|
2205
|
F
|
street_7
|
183
|
76
|
85.4
|
B
|
23
|
S_2
|
C_2
|
2204
|
M
|
street_1
|
175
|
74
|
47.2
|
B-
|
22
|
S_2
|
C_2
|
2203
|
M
|
street_4
|
155
|
91
|
73.8
|
A+
|
21
|
S_2
|
C_2
|
2202
|
F
|
street_7
|
194
|
77
|
68.5
|
B+
|
5
|
S_1
|
C_2
|
1201
|
M
|
street_5
|
188
|
68
|
97.0
|
A-
|
20
|
S_2
|
C_2
|
2201
|
M
|
street_5
|
193
|
100
|
39.1
|
B
|
9
|
S_1
|
C_2
|
1205
|
F
|
street_6
|
167
|
63
|
68.4
|
B-
|
8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
33.8
|
B
|
7
|
S_1
|
C_2
|
1203
|
M
|
street_6
|
160
|
53
|
58.8
|
A+
|
多个值排序,即先对第一层排,在第一层相同的情况下,对第二层排序
df.sort_values(by=['Math','Physics']).head(20)
|
School
|
Class
|
ID
|
Gender
|
Address
|
Height
|
Weight
|
Math
|
Physics
|
10
|
S_1
|
C_3
|
1301
|
M
|
street_4
|
161
|
68
|
31.5
|
B+
|
1
|
S_1
|
C_1
|
1102
|
F
|
street_2
|
192
|
73
|
32.5
|
B+
|
26
|
S_2
|
C_3
|
2302
|
M
|
street_5
|
171
|
88
|
32.7
|
A
|
8
|
S_1
|
C_2
|
1204
|
F
|
street_5
|
162
|
63
|
33.8
|
B
|
0
|
S_1
|
C_1
|
1101
|
M
|
street_1
|
173
|
63
|
34.0
|
A+
|
19
|
S_2
|
C_1
|
2105
|
M
|
street_4
|
170
|
81
|
34.2
|
A
|
20
|
S_2
|
C_2
|
2201
|
M
|
street_5
|
193
|
100
|
39.1
|
B
|
30
|
S_2
|
C_4
|
2401
|
F
|
street_2
|
192
|
62
|
45.3
|
A
|
23
|
S_2
|
C_2
|
2204
|
M
|
street_1
|
175
|
74
|
47.2
|
B-
|
34
|
S_2
|
C_4
|
2405
|
F
|
street_6
|
193
|
54
|
47.6
|
B
|
31
|
S_2
|
C_4
|
2402
|
M
|
street_7
|
166
|
82
|
48.7
|
B
|
29
|
S_2
|
C_3
|
2305
|
M
|
street_4
|
187
|
73
|
48.9
|
B
|
12
|
S_1
|
C_3
|
1303
|
M
|
street_7
|
188
|
82
|
49.7
|
B
|
16
|
S_2
|
C_1
|
2102
|
F
|
street_6
|
161
|
61
|
50.6
|
B+
|
17
|
S_2
|
C_1
|
2103
|
M
|
street_4
|
157
|
61
|
52.5
|
B-
|
7
|
S_1
|
C_2
|
1203
|
M
|
street_6
|
160
|
53
|
58.8
|
A+
|
32
|
S_2
|
C_4
|
2403
|
F
|
street_6
|
158
|
60
|
59.7
|
B+
|
14
|
S_1
|
C_3
|
1305
|
F
|
street_5
|
187
|
69
|
61.7
|
B-
|
6
|
S_1
|
C_2
|
1202
|
F
|
street_4
|
176
|
94
|
63.5
|
B-
|
27
|
S_2
|
C_3
|
2303
|
F
|
street_7
|
190
|
99
|
65.9
|
C
|
问题与练习
问题
Series和DataFrame有哪些常见属性和方法?
- Series: values, index, name, dtype
- DataFrame: index, columns, values
value_counts会统计缺失值吗?
与idxmax和nlargest功能相反的是哪两组函数?
df.mean(axis=1)是什么意思?它与df.mean()的结果一样吗?第一问提到的函数也有axis参数吗?怎么使用?
- df.mean(axis=1) 是对 DataFrame 按行求均值(每一行得到一个值);df.mean() 默认 axis=0,是按列求均值(每一列得到一个值),所以两者结果不一样。
练习
现有一份关于美剧《权力的游戏》剧本的数据集,请解决以下问题:
- 在所有的数据中,一共出现了多少人物?
- 以单元格计数(即简单把一个单元格视作一句),谁说了最多的话?
- 以单词计数,谁说了最多的单词?
# Load the Game of Thrones script data set. The path is a raw string so the
# Windows backslashes stay literal instead of depending on unrecognised
# escape sequences.
data1=pd.read_csv(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\Game_of_Thrones_Script.csv')
data1.head(20)
|
Release Date
|
Season
|
Episode
|
Episode Title
|
Name
|
Sentence
|
0
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
waymar royce
|
What do you expect? They're savages. One lot s...
|
1
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
will
|
I've never seen wildlings do a thing like this...
|
2
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
waymar royce
|
How close did you get?
|
3
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
will
|
Close as any man would.
|
4
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
gared
|
We should head back to the wall.
|
5
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
royce
|
Do the dead frighten you?
|
6
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
gared
|
Our orders were to track the wildlings. We tra...
|
7
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
royce
|
You don't think he'll ask us how they died? Ge...
|
8
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
will
|
Whatever did it to them could do it to us. The...
|
9
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
royce
|
It's a good thing we're not children. You want...
|
10
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
royce
|
Your dead men seem to have moved camp.
|
11
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
will
|
They were here.
|
12
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
gared
|
See where they went.
|
13
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
royce
|
What is it?
|
14
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
gared
|
It's …
|
15
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
jon snow
|
Go on. Father's watching.
|
16
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
jon snow
|
And your mother.
|
17
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
septa mordane
|
Fine work, as always. Well done.
|
18
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
sansa stark
|
Thank you.
|
19
|
2011/4/17
|
Season 1
|
Episode 1
|
Winter is Coming
|
septa mordane
|
I love the detail that you've managed to get i...
|
data1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23911 entries, 0 to 23910
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Release Date   23911 non-null  object
 1   Season         23911 non-null  object
 2   Episode        23911 non-null  object
 3   Episode Title  23911 non-null  object
 4   Name           23911 non-null  object
 5   Sentence       23911 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB
# Number of distinct character names in the data set (564).
data1['Name'].nunique()
564
# Treating each cell as one line: describe() on a text column reports the
# most frequent value (top) and its count (freq), here tyrion lannister with
# 1760 lines.
data1['Name'].describe()
count 23911
unique 564
top tyrion lannister
freq 1760
Name: Name, dtype: object
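第三个题目当时没有做出来。思路:先用 data1['Sentence'].str.split().str.len() 算出每个单元格的单词数,再按 Name 分组求和,最后用 idxmax() 找出单词总数最多的人物。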
现有一份关于科比的投篮数据集,请解决如下问题:
- 哪种action_type和combined_shot_type的组合是最多的?
- 在所有被记录的game_id中,遭遇到最多的opponent是哪一支球队?
# Load the Kobe Bryant shot data set. The path is a raw string so the Windows
# backslashes stay literal instead of depending on unrecognised escape
# sequences.
data2=pd.read_csv(r'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\Kobe_data.csv')
data2
|
shot_id
|
action_type
|
combined_shot_type
|
game_event_id
|
game_id
|
lat
|
loc_x
|
loc_y
|
lon
|
minutes_remaining
|
...
|
shot_made_flag
|
shot_type
|
shot_zone_area
|
shot_zone_basic
|
shot_zone_range
|
team_id
|
team_name
|
game_date
|
matchup
|
opponent
|
0
|
1
|
Jump Shot
|
Jump Shot
|
10
|
20000012
|
33.9723
|
167
|
72
|
-118.1028
|
10
|
...
|
NaN
|
2PT Field Goal
|
Right Side(R)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
1
|
2
|
Jump Shot
|
Jump Shot
|
12
|
20000012
|
34.0443
|
-157
|
0
|
-118.4268
|
10
|
...
|
0.0
|
2PT Field Goal
|
Left Side(L)
|
Mid-Range
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
2
|
3
|
Jump Shot
|
Jump Shot
|
35
|
20000012
|
33.9093
|
-101
|
135
|
-118.3708
|
7
|
...
|
1.0
|
2PT Field Goal
|
Left Side Center(LC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
3
|
4
|
Jump Shot
|
Jump Shot
|
43
|
20000012
|
33.8693
|
138
|
175
|
-118.1318
|
6
|
...
|
0.0
|
2PT Field Goal
|
Right Side Center(RC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
4
|
5
|
Driving Dunk Shot
|
Dunk
|
155
|
20000012
|
34.0443
|
0
|
0
|
-118.2698
|
6
|
...
|
1.0
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
30692
|
30693
|
Jump Shot
|
Jump Shot
|
397
|
49900088
|
33.9963
|
1
|
48
|
-118.2688
|
6
|
...
|
0.0
|
2PT Field Goal
|
Center(C)
|
In The Paint (Non-RA)
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/6/19
|
LAL vs. IND
|
IND
|
30693
|
30694
|
Tip Shot
|
Tip Shot
|
398
|
49900088
|
34.0443
|
0
|
0
|
-118.2698
|
6
|
...
|
NaN
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/6/19
|
LAL vs. IND
|
IND
|
30694
|
30695
|
Running Jump Shot
|
Jump Shot
|
426
|
49900088
|
33.8783
|
-134
|
166
|
-118.4038
|
3
|
...
|
1.0
|
2PT Field Goal
|
Left Side Center(LC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/6/19
|
LAL vs. IND
|
IND
|
30695
|
30696
|
Jump Shot
|
Jump Shot
|
448
|
49900088
|
33.7773
|
31
|
267
|
-118.2388
|
2
|
...
|
0.0
|
3PT Field Goal
|
Center(C)
|
Above the Break 3
|
24+ ft.
|
1610612747
|
Los Angeles Lakers
|
2000/6/19
|
LAL vs. IND
|
IND
|
30696
|
30697
|
Jump Shot
|
Jump Shot
|
471
|
49900088
|
33.9723
|
1
|
72
|
-118.2688
|
0
|
...
|
0.0
|
2PT Field Goal
|
Center(C)
|
In The Paint (Non-RA)
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/6/19
|
LAL vs. IND
|
IND
|
30697 rows × 25 columns
# Make shot_id the row index. The membership check keeps the cell safe to
# re-run: once shot_id has become the index it is no longer a column, and
# calling set_index('shot_id') a second time raises
# KeyError: "None of ['shot_id'] are in the columns".
if 'shot_id' in data2.columns:
    data2=data2.set_index('shot_id')
data2.head(20)
|
action_type
|
combined_shot_type
|
game_event_id
|
game_id
|
lat
|
loc_x
|
loc_y
|
lon
|
minutes_remaining
|
period
|
...
|
shot_made_flag
|
shot_type
|
shot_zone_area
|
shot_zone_basic
|
shot_zone_range
|
team_id
|
team_name
|
game_date
|
matchup
|
opponent
|
shot_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1
|
Jump Shot
|
Jump Shot
|
10
|
20000012
|
33.9723
|
167
|
72
|
-118.1028
|
10
|
1
|
...
|
NaN
|
2PT Field Goal
|
Right Side(R)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
2
|
Jump Shot
|
Jump Shot
|
12
|
20000012
|
34.0443
|
-157
|
0
|
-118.4268
|
10
|
1
|
...
|
0.0
|
2PT Field Goal
|
Left Side(L)
|
Mid-Range
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
3
|
Jump Shot
|
Jump Shot
|
35
|
20000012
|
33.9093
|
-101
|
135
|
-118.3708
|
7
|
1
|
...
|
1.0
|
2PT Field Goal
|
Left Side Center(LC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
4
|
Jump Shot
|
Jump Shot
|
43
|
20000012
|
33.8693
|
138
|
175
|
-118.1318
|
6
|
1
|
...
|
0.0
|
2PT Field Goal
|
Right Side Center(RC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
5
|
Driving Dunk Shot
|
Dunk
|
155
|
20000012
|
34.0443
|
0
|
0
|
-118.2698
|
6
|
2
|
...
|
1.0
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
6
|
Jump Shot
|
Jump Shot
|
244
|
20000012
|
34.0553
|
-145
|
-11
|
-118.4148
|
9
|
3
|
...
|
0.0
|
2PT Field Goal
|
Left Side(L)
|
Mid-Range
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
7
|
Layup Shot
|
Layup
|
251
|
20000012
|
34.0443
|
0
|
0
|
-118.2698
|
8
|
3
|
...
|
1.0
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
8
|
Jump Shot
|
Jump Shot
|
254
|
20000012
|
34.0163
|
1
|
28
|
-118.2688
|
8
|
3
|
...
|
NaN
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
9
|
Jump Shot
|
Jump Shot
|
265
|
20000012
|
33.9363
|
-65
|
108
|
-118.3348
|
6
|
3
|
...
|
1.0
|
2PT Field Goal
|
Left Side(L)
|
In The Paint (Non-RA)
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
10
|
Running Jump Shot
|
Jump Shot
|
294
|
20000012
|
33.9193
|
-33
|
125
|
-118.3028
|
3
|
3
|
...
|
0.0
|
2PT Field Goal
|
Center(C)
|
In The Paint (Non-RA)
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
11
|
Jump Shot
|
Jump Shot
|
309
|
20000012
|
33.8063
|
-94
|
238
|
-118.3638
|
1
|
3
|
...
|
0.0
|
3PT Field Goal
|
Left Side Center(LC)
|
Above the Break 3
|
24+ ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
12
|
Jump Shot
|
Jump Shot
|
4
|
20000019
|
33.9173
|
121
|
127
|
-118.1488
|
11
|
1
|
...
|
1.0
|
2PT Field Goal
|
Right Side Center(RC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
13
|
Running Jump Shot
|
Jump Shot
|
27
|
20000019
|
33.9343
|
-67
|
110
|
-118.3368
|
7
|
1
|
...
|
1.0
|
2PT Field Goal
|
Left Side(L)
|
In The Paint (Non-RA)
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
14
|
Jump Shot
|
Jump Shot
|
66
|
20000019
|
34.0403
|
-94
|
4
|
-118.3638
|
2
|
1
|
...
|
0.0
|
2PT Field Goal
|
Left Side(L)
|
Mid-Range
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
15
|
Jump Shot
|
Jump Shot
|
80
|
20000019
|
33.9973
|
-23
|
47
|
-118.2928
|
1
|
1
|
...
|
0.0
|
2PT Field Goal
|
Center(C)
|
In The Paint (Non-RA)
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
16
|
Jump Shot
|
Jump Shot
|
86
|
20000019
|
33.8523
|
62
|
192
|
-118.2078
|
0
|
1
|
...
|
0.0
|
2PT Field Goal
|
Center(C)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
17
|
Driving Layup Shot
|
Layup
|
100
|
20000019
|
34.0443
|
0
|
0
|
-118.2698
|
0
|
1
|
...
|
NaN
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
18
|
Jump Shot
|
Jump Shot
|
138
|
20000019
|
33.8183
|
-117
|
226
|
-118.3868
|
8
|
2
|
...
|
1.0
|
3PT Field Goal
|
Left Side Center(LC)
|
Above the Break 3
|
24+ ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
19
|
Jump Shot
|
Jump Shot
|
244
|
20000019
|
33.9473
|
-132
|
97
|
-118.4018
|
11
|
3
|
...
|
0.0
|
2PT Field Goal
|
Left Side Center(LC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
20
|
Driving Layup Shot
|
Layup
|
249
|
20000019
|
34.0443
|
0
|
0
|
-118.2698
|
10
|
3
|
...
|
NaN
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/11/1
|
LAL vs. UTA
|
UTA
|
20 rows × 24 columns
# Count the rows for every (action_type, combined_shot_type) pair and list
# the most frequent pairs first. The grouping keys must be passed as a list:
# a tuple is interpreted as one single key and raises
# KeyError: ('action_type', 'combined_shot_type').
data2.groupby(by=['action_type','combined_shot_type']).size().sort_values(ascending=False).head()
# Alternative approach: add a helper column that concatenates the two type
# columns so each combination becomes a single string. No separator is used
# here, so the parts run together (e.g. 'Jump ShotJump Shot').
data2_join=data2.assign(join=data2['combined_shot_type']+data2['action_type'])
data2_join.head()
|
action_type
|
combined_shot_type
|
game_event_id
|
game_id
|
lat
|
loc_x
|
loc_y
|
lon
|
minutes_remaining
|
period
|
...
|
shot_type
|
shot_zone_area
|
shot_zone_basic
|
shot_zone_range
|
team_id
|
team_name
|
game_date
|
matchup
|
opponent
|
join
|
shot_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1
|
Jump Shot
|
Jump Shot
|
10
|
20000012
|
33.9723
|
167
|
72
|
-118.1028
|
10
|
1
|
...
|
2PT Field Goal
|
Right Side(R)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
Jump ShotJump Shot
|
2
|
Jump Shot
|
Jump Shot
|
12
|
20000012
|
34.0443
|
-157
|
0
|
-118.4268
|
10
|
1
|
...
|
2PT Field Goal
|
Left Side(L)
|
Mid-Range
|
8-16 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
Jump ShotJump Shot
|
3
|
Jump Shot
|
Jump Shot
|
35
|
20000012
|
33.9093
|
-101
|
135
|
-118.3708
|
7
|
1
|
...
|
2PT Field Goal
|
Left Side Center(LC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
Jump ShotJump Shot
|
4
|
Jump Shot
|
Jump Shot
|
43
|
20000012
|
33.8693
|
138
|
175
|
-118.1318
|
6
|
1
|
...
|
2PT Field Goal
|
Right Side Center(RC)
|
Mid-Range
|
16-24 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
Jump ShotJump Shot
|
5
|
Driving Dunk Shot
|
Dunk
|
155
|
20000012
|
34.0443
|
0
|
0
|
-118.2698
|
6
|
2
|
...
|
2PT Field Goal
|
Center(C)
|
Restricted Area
|
Less Than 8 ft.
|
1610612747
|
Los Angeles Lakers
|
2000/10/31
|
LAL @ POR
|
POR
|
DunkDriving Dunk Shot
|
5 rows × 25 columns
data2_join1=data2['combined_shot_type']+'+'+data2['action_type']
data2_join1.head(100)
shot_id
1 Jump Shot+Jump Shot
2 Jump Shot+Jump Shot
3 Jump Shot+Jump Shot
4 Jump Shot+Jump Shot
5 Dunk+Driving Dunk Shot...
96 Jump Shot+Jump Shot
97 Jump Shot+Jump Shot
98 Jump Shot+Jump Shot
99 Jump Shot+Jump Shot
100 Jump Shot+Jump Shot
Length: 100, dtype: object
data2_join1.nunique()
57
# Frequency of each combination, most common first; the top entry is
# 'Jump Shot+Jump Shot'.
data2_join1.value_counts()
Jump Shot+Jump Shot 18880
Layup+Layup Shot 2567
Layup+Driving Layup Shot 1978
Jump Shot+Turnaround Jump Shot 1057
Jump Shot+Fadeaway Jump Shot 1048
Jump Shot+Running Jump Shot 926
Jump Shot+Pullup Jump shot 476
Jump Shot+Turnaround Fadeaway shot 439
Dunk+Slam Dunk Shot 411
Layup+Reverse Layup Shot 395
Jump Shot+Jump Bank Shot 333
Dunk+Driving Dunk Shot 310
Dunk+Dunk Shot 262
Tip Shot+Tip Shot 182
Dunk+Alley Oop Dunk Shot 122
Jump Shot+Step Back Jump shot 118
Jump Shot+Floating Jump shot 114
Layup+Driving Reverse Layup Shot 97
Hook Shot+Hook Shot 84
Layup+Driving Finger Roll Shot 82
Layup+Alley Oop Layup shot 80
Dunk+Reverse Dunk Shot 75
Layup+Running Layup Shot 72
Bank Shot+Turnaround Bank shot 71
Layup+Driving Finger Roll Layup Shot 69
Bank Shot+Running Bank shot 48
Dunk+Driving Slam Dunk Shot 48
Hook Shot+Running Hook Shot 41
Layup+Finger Roll Layup Shot 33
Jump Shot+Fadeaway Bank shot 31
Layup+Finger Roll Shot 28
Jump Shot+Driving Jump shot 28
Jump Shot+Jump Hook Shot 24
Dunk+Running Dunk Shot 19
Dunk+Reverse Slam Dunk Shot 16
Layup+Putback Layup Shot 15
Dunk+Follow Up Dunk Shot 15
Hook Shot+Turnaround Hook Shot 14
Hook Shot+Driving Hook Shot 14
Bank Shot+Pullup Bank shot 12
Layup+Running Reverse Layup Shot 11
Layup+Running Finger Roll Layup Shot 6
Layup+Cutting Layup Shot 6
Bank Shot+Driving Bank shot 5
Bank Shot+Hook Bank Shot 5
Dunk+Putback Dunk Shot 5
Jump Shot+Driving Floating Jump Shot 5
Layup+Running Finger Roll Shot 4
Jump Shot+Running Pull-Up Jump Shot 4
Layup+Turnaround Finger Roll Shot 2
Tip Shot+Running Tip Shot 2
Layup+Tip Layup Shot 2
Dunk+Putback Slam Dunk Shot 2
Dunk+Running Slam Dunk Shot 1
Jump Shot+Turnaround Fadeaway Bank Jump Shot 1
Jump Shot+Driving Floating Bank Jump Shot 1
Layup+Cutting Finger Roll Layup Shot 1
dtype: int64
# Count the rows recorded against each opponent.
data2['opponent'].value_counts()
# The most frequently faced opponent is SAS
SAS 1978
PHX 1781
HOU 1666
SAC 1643
DEN 1642
POR 1539
UTA 1490
MIN 1474
GSW 1356
LAC 1285
DAL 1199
MEM 1030
BOS 946
SEA 828
IND 761
PHI 720
ORL 719
DET 715
OKC 677
TOR 664
NYK 657
MIA 627
CHA 620
CLE 619
CHI 610
WAS 600
MIL 586
NOH 581
NJN 520
ATL 519
NOP 344
VAN 246
BKN 55
Name: opponent, dtype: int64
经验总结
本次是第二次参加组队学习,这次是针对pandas的学习
在这个练习中暂时没有遇到什么问题,只有课后习题第一题的第三问没有做出来,等之后学习到相关内容时再解决。其次,自己的解决方案和参考答案的方法不同,但结果相同。
pandas 第十二期组队-pandas基础相关推荐
- Datawhale第十二期组队学习--Python爬虫编程实践 Task03:session和cookie、代理、selenium自动化 拔高:丁香园留言板爬取
一. session和cookie 常用的 web 容器有 Nginx . Apache . Tomcat .Weblogic . Jboss . Resin 等等 http 1.0 HTTP1.0的 ...
- Python 数据分析三剑客之 Pandas(十):数据读写
CSDN 课程推荐:<迈向数据科学家:带你玩转Python数据分析>,讲师齐伟,苏州研途教育科技有限公司CTO,苏州大学应用统计专业硕士生指导委员会委员:已出版<跟老齐学Python ...
- linux pandas教程_十分钟入门 Pandas
# 十分钟入门 Pandas 本节是帮助 Pandas 新手快速上手的简介.烹饪指南里介绍了更多实用案例. 本节以下列方式导入 Pandas 与 NumPy: In [1]: import numpy ...
- python3行3列文件数据赋值_Python3快速入门(十四)——Pandas数据读取
Python3快速入门(十四)--Pandas数据读取 一.DataFrame IO 1.CSV文件 pandas.read_csv(filepath_or_buffer, na_values='NA ...
- Python3快速入门(十四)——Pandas数据读取
Python3快速入门(十四)--Pandas数据读取 一.DataFrame IO 1.CSV文件 pandas.read_csv(filepath_or_buffer, na_values='NA ...
- pandas 对某一行标准化_Python中的神器Pandas,但是有人说Pandas慢...
如果你从事大数据工作,用Python的Pandas库时会发现很多惊喜.Pandas在数据科学和分析领域扮演越来越重要的角色,尤其是对于从Excel和VBA转向Python的用户. 所以,对于数据科学家 ...
- python中pandas是指什么_Python中的神器Pandas,但是有人说Pandas慢...
原标题:Python中的神器Pandas,但是有人说Pandas慢... 如果你从事大数据工作,用Python的Pandas库时会发现很多惊喜.Pandas在数据科学和分析领域扮演越来越重要的角色,尤 ...
- 微信小程序周报(第十二期)-极乐小程序商店出品
重要:极乐商店域名变更:wxapp.dreawer.com/变更为store.dreawer.com/ 每周一笑 对于各种凌乱的电脑问题,手机问题,其他行业的人,以为程序员们,什么都会,程序员中,女程 ...
- 平安科技移动开发二队技术周报(第十二期)
平安科技移动开发二队技术周报(第十二期) @author ASCE1885的 Github 简书 微博 CSDN 业界新闻 1)无人机将带领程序员进入一个新开发者时代 将无人机和不同行业结合起来想象空 ...
最新文章
- 特征选择---文本分类:叉方统计量
- win10添加新用户
- Lazada2022新开店入驻流程及费用
- 生于拉新,死于留存!如何找到 Aha moment?
- BUG总结——【构造函数写逻辑】引发的极大的线上问题
- python购物车典例_Python入门:购物车实例
- 软件工程心理学之9----乙方如何面对甲方2
- 第13章 程序的动态加载和执行(二,用户代码)
- Vrep之configuration space之间和pose之间距离的计算方法
- 嘉年华回顾丨周振兴带你解密POLARDB产品架构与实现
- mysql 服务已经存在_windows下mysql主从同步时从服务器出现表已经存在解决办法
- 《CCIE路由和交换认证考试指南(第5版) (第2卷)》——1.6节理解与记忆
- 阿里OSS对象存储,实现图片上传代码;
- php爬取网站所有链接,php 爬取超链接
- Spring RestTemplate的使用示例
- atitit.DD dragdrop拖拽文件到界面功能 html5 web 跟个java swing c#.net c++ 的总结
- Jira 破解版 Docker 部署
- JSP数据库增删改查
- c语言怎么输入加减符号,C语言有符号加减溢出
- SQL 2008函数大全
热门文章
- 用python爬取实时基金估值
- 电阻、电容选型的要点
- 项目中引入阿里巴巴矢量库(unicode)
- C语言程序设计(三)高级特性
- python sdk是什么意思_sdk啥意思
- Android 设置黑白主题
- 微信电脑端双击不能放大图片怎么回事_电脑微信收到的截图是这样子的,双击一下打开才看的到,我自己也发不出截图...
- 力扣977有序数组的平方(c++版)
- Dicom 图像--像素值(灰度值)转为CT值
- 【我与云表的故事】我用云表做工作日历