import numpy as np
import pandas as pd
# Build the sample table: six rows; the city values carry stray spaces and
# mixed case, and two of the prices are NaN.
df = pd.DataFrame(
    data={
        "id": [1001, 1002, 1003, 1004, 1005, 1006],
        "date": pd.date_range(start='20130102', periods=6),
        "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "age": [23, 44, 54, 32, 34, 32],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)
df
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
Beijing
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
NaN
|
2
|
1003
|
2013-01-04
|
guangzhou
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
Shenzhen
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
shanghai
|
210-A
|
34
|
NaN
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
df.shape  # dimensions as (rows, columns)
(6, 6)
df.info()  # summary: index range, column names, non-null counts, dtypes, memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id 6 non-null int64
date 6 non-null datetime64[ns]
city 6 non-null object
category 6 non-null object
age 6 non-null int64
price 4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes
df.dtypes  # dtype of every column
id int64
date datetime64[ns]
city object
category object
age int64
price float64
dtype: object
df['price'].dtype  # dtype of a single column
dtype('float64')
df['price'].isnull()  # boolean mask: True where price is missing
0 False
1 True
2 False
3 False
4 True
5 False
Name: price, dtype: bool
df['price'].unique()  # unique values of one column
df['city'].unique()  # only this last expression is shown as the cell output
array(['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai','BEIJING '], dtype=object)
df['price'].value_counts()  # occurrences of each distinct value (missing values are not counted)
4432.0 1
5433.0 1
2133.0 1
1200.0 1
Name: price, dtype: int64
df.values  # the table as a NumPy ndarray
array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,1200.0],[1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],[1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A',54, 2133.0],[1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,5433.0],[1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,nan],[1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,4432.0]], dtype=object)
df.columns  # column labels
Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
df.head()  # first 5 rows
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
Beijing
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
NaN
|
2
|
1003
|
2013-01-04
|
guangzhou
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
Shenzhen
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
shanghai
|
210-A
|
34
|
NaN
|
df.tail()  # last 5 rows
|
id
|
date
|
city
|
category
|
age
|
price
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
NaN
|
2
|
1003
|
2013-01-04
|
guangzhou
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
Shenzhen
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
shanghai
|
210-A
|
34
|
NaN
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
数据表清洗
df.fillna(value=0)  # fill missing values with 0; the result is not assigned, so df is unchanged
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
Beijing
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
0.0
|
2
|
1003
|
2013-01-04
|
guangzhou
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
Shenzhen
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
shanghai
|
210-A
|
34
|
0.0
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
df['price']=df['price'].fillna(df['price'].mean())  # replace missing prices with the column mean
df
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
Beijing
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
2
|
1003
|
2013-01-04
|
guangzhou
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
Shenzhen
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
shanghai
|
210-A
|
34
|
3299.5
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
print(df['city'])
# Strip leading/trailing whitespace from city (e.g. 'Beijing ' and ' guangzhou ').
# The vectorised .str accessor leaves missing values as NaN, whereas
# map(str.strip) raises TypeError on any non-string entry.
df['city']=df['city'].str.strip()
print(df['city'])
0 Beijing
1 SH
2 guangzhou
3 Shenzhen
4 shanghai
5 BEIJING
Name: city, dtype: object
0 Beijing
1 SH
2 guangzhou
3 Shenzhen
4 shanghai
5 BEIJING
Name: city, dtype: object
df['city']=df['city'].str.upper()  # convert to upper case (.str.lower() gives lower case)
df
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
df['price'].astype('int')  # convert dtype to int; the result is not assigned, so df['price'] keeps its dtype
0 1200
1 3299
2 2133
3 5433
4 3299
5 4432
Name: price, dtype: int32
df.rename(columns={'category': 'category-size'})  # rename a column; the result is not assigned, so df is unchanged
|
id
|
date
|
city
|
category-size
|
age
|
price
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
# df['city'].drop_duplicates()  # drop later occurrences of duplicate values
# df
# Replace the abbreviation with the full city name. The city values were
# upper-cased earlier and Series.replace matches whole values case-sensitively,
# so the pattern must be 'SH' (lowercase 'sh' matches nothing).
# The result is not assigned, so df itself is left unchanged.
df['city'].replace('SH','SHANGHAI')
0 BEIJING
1 SH
2 GUANGZHOU
3 SHENZHEN
4 SHANGHAI
5 BEIJING
Name: city, dtype: object
数据预处理
# Second table keyed by id: gender, payment flag and member points.
# It holds two ids (1007, 1008) that do not appear in df.
df1 = pd.DataFrame(
    data={
        "id": [1002, 1001, 1003, 1004, 1005, 1006, 1007, 1008],
        "gender": ['male', 'female'] * 4,
        "pay": ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y'],
        "m-point": [10, 12, 20, 40, 40, 40, 30, 20],
    }
)
df1
|
id
|
gender
|
pay
|
m-point
|
0
|
1002
|
male
|
Y
|
10
|
1
|
1001
|
female
|
N
|
12
|
2
|
1003
|
male
|
Y
|
20
|
3
|
1004
|
female
|
Y
|
40
|
4
|
1005
|
male
|
N
|
40
|
5
|
1006
|
female
|
Y
|
40
|
6
|
1007
|
male
|
N
|
30
|
7
|
1008
|
female
|
Y
|
20
|
merge
# Inner join on id: keep only the ids present in both df and df1.
df_inner = df.merge(df1, on='id', how='inner')
df_inner.head(10)
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
# Left join: keep every row of df and attach df1's columns where the id matches.
# The key is named explicitly (as in the inner join) instead of relying on the
# implicit "all shared column names" default.
df_left=pd.merge(df,df1,how='left',on='id')
df_left
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
# Right join: keep every row of df1; ids missing from df get NaN/NaT in df's columns.
# The key is named explicitly (as in the inner join) instead of relying on the
# implicit "all shared column names" default.
df_right=pd.merge(df,df1,how='right',on='id')
df_right
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23.0
|
1200.0
|
female
|
N
|
12
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44.0
|
3299.5
|
male
|
Y
|
10
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54.0
|
2133.0
|
male
|
Y
|
20
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32.0
|
5433.0
|
female
|
Y
|
40
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34.0
|
3299.5
|
male
|
N
|
40
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32.0
|
4432.0
|
female
|
Y
|
40
|
6
|
1007
|
NaT
|
NaN
|
NaN
|
NaN
|
NaN
|
male
|
N
|
30
|
7
|
1008
|
NaT
|
NaN
|
NaN
|
NaN
|
NaN
|
female
|
Y
|
20
|
# Outer join: union of the ids from both tables; unmatched cells become NaN/NaT.
# The key is named explicitly (as in the inner join) instead of relying on the
# implicit "all shared column names" default.
df_outer=pd.merge(df,df1,how='outer',on='id')
df_outer
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23.0
|
1200.0
|
female
|
N
|
12
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44.0
|
3299.5
|
male
|
Y
|
10
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54.0
|
2133.0
|
male
|
Y
|
20
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32.0
|
5433.0
|
female
|
Y
|
40
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34.0
|
3299.5
|
male
|
N
|
40
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32.0
|
4432.0
|
female
|
Y
|
40
|
6
|
1007
|
NaT
|
NaN
|
NaN
|
NaN
|
NaN
|
male
|
N
|
30
|
7
|
1008
|
NaT
|
NaN
|
NaN
|
NaN
|
NaN
|
female
|
Y
|
20
|
append
# Stack df below df1 (row-wise). DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; pd.concat is the supported replacement. sort=True keeps
# the alphabetically sorted column order that append produced here.
result = pd.concat([df1, df], sort=True)
result
G:\Anaconda\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'.sort=sort)
|
age
|
category
|
city
|
date
|
gender
|
id
|
m-point
|
pay
|
price
|
0
|
NaN
|
NaN
|
NaN
|
NaT
|
male
|
1002
|
10.0
|
Y
|
NaN
|
1
|
NaN
|
NaN
|
NaN
|
NaT
|
female
|
1001
|
12.0
|
N
|
NaN
|
2
|
NaN
|
NaN
|
NaN
|
NaT
|
male
|
1003
|
20.0
|
Y
|
NaN
|
3
|
NaN
|
NaN
|
NaN
|
NaT
|
female
|
1004
|
40.0
|
Y
|
NaN
|
4
|
NaN
|
NaN
|
NaN
|
NaT
|
male
|
1005
|
40.0
|
N
|
NaN
|
5
|
NaN
|
NaN
|
NaN
|
NaT
|
female
|
1006
|
40.0
|
Y
|
NaN
|
6
|
NaN
|
NaN
|
NaN
|
NaT
|
male
|
1007
|
30.0
|
N
|
NaN
|
7
|
NaN
|
NaN
|
NaN
|
NaT
|
female
|
1008
|
20.0
|
Y
|
NaN
|
0
|
23.0
|
100-A
|
BEIJING
|
2013-01-02
|
NaN
|
1001
|
NaN
|
NaN
|
1200.0
|
1
|
44.0
|
100-B
|
SH
|
2013-01-03
|
NaN
|
1002
|
NaN
|
NaN
|
3299.5
|
2
|
54.0
|
110-A
|
GUANGZHOU
|
2013-01-04
|
NaN
|
1003
|
NaN
|
NaN
|
2133.0
|
3
|
32.0
|
110-C
|
SHENZHEN
|
2013-01-05
|
NaN
|
1004
|
NaN
|
NaN
|
5433.0
|
4
|
34.0
|
210-A
|
SHANGHAI
|
2013-01-06
|
NaN
|
1005
|
NaN
|
NaN
|
3299.5
|
5
|
32.0
|
130-F
|
BEIJING
|
2013-01-07
|
NaN
|
1006
|
NaN
|
NaN
|
4432.0
|
join
# Two small frames that share the default 0..2 index, joined column-wise.
left = pd.DataFrame(data={'a': [1, 2, 3], 'b': [4, 6, 7]})
left
right = pd.DataFrame(data={'c': [1, 1, 1], 'd': [2, 2, 2]})
right
# join aligns on the index; how='inner' keeps the index labels present in both.
result = left.join(other=right, how='inner')
result
|
a
|
b
|
c
|
d
|
0
|
1
|
4
|
1
|
2
|
1
|
2
|
6
|
1
|
2
|
2
|
3
|
7
|
1
|
2
|
concat
frames=[left,right,result]
result1=pd.concat(frames)  # stack the three frames row-wise; columns a frame lacks are filled with NaN
result1
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'.
|
a
|
b
|
c
|
d
|
0
|
1.0
|
4.0
|
NaN
|
NaN
|
1
|
2.0
|
6.0
|
NaN
|
NaN
|
2
|
3.0
|
7.0
|
NaN
|
NaN
|
0
|
NaN
|
NaN
|
1.0
|
2.0
|
1
|
NaN
|
NaN
|
1.0
|
2.0
|
2
|
NaN
|
NaN
|
1.0
|
2.0
|
0
|
1.0
|
4.0
|
1.0
|
2.0
|
1
|
2.0
|
6.0
|
1.0
|
2.0
|
2
|
3.0
|
7.0
|
1.0
|
2.0
|
df  # show df again (cleaned values, still without the merged columns)
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
df_inner.set_index('id')  # use id as the index; the result is not assigned, so df_inner is unchanged
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
id
|
|
|
|
|
|
|
|
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
df.sort_values(by=['age'])  # sort rows by the values of a column (age, ascending)
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
df.sort_index()  # sort rows by index label
|
id
|
date
|
city
|
category
|
age
|
price
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
df['group']=np.where(df['price']>3000,'high','low')
df
# group is 'high' where price > 3000, otherwise 'low'
|
id
|
date
|
city
|
category
|
age
|
price
|
group
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
low
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
high
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
low
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
high
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
high
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
high
|
df_inner  # merged table; it was built before 'group' was added to df, so it has no group column
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
# Flag the rows that satisfy several conditions at once
df_inner.loc[(df_inner['city']=='BEIJING')&(df_inner['price']>=4000),'sign']=1  # rows that do not match get NaN in the new column
df_inner
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
NaN
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
NaN
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
NaN
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
NaN
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
NaN
|
5
|
1006
|
2013-01-07
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
1.0
|
# pd.DataFrame((x.split('-')for x in df_inner['category']),index=df_inner.index,columns=['category','size'])
数据提取:loc、iloc 和 ix。loc 按标签值进行提取,iloc 按位置进行提取,ix 可以同时按标签和位置进行提取(注意:ix 已被弃用,并在 pandas 1.0 中移除,新代码请改用 loc 和 iloc)。
df_inner.loc[3]  # select a single row by its index label
id 1004
date 2013-01-05 00:00:00
city SHENZHEN
category 110-C
age 32
price 5433
gender female
pay Y
m-point 40
sign NaN
Name: 3, dtype: object
df_inner.iloc[0:5]  # select a range of rows by position (positions 0 through 4)
|
id
|
date
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
0
|
1001
|
2013-01-02
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
NaN
|
1
|
1002
|
2013-01-03
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
NaN
|
2
|
1003
|
2013-01-04
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
NaN
|
3
|
1004
|
2013-01-05
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
NaN
|
4
|
1005
|
2013-01-06
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
NaN
|
df_inner=df_inner.set_index('date')  # make date the index (df_inner is reassigned)
df_inner.iloc[[0,2,5],[4,5]]  # rows at positions 0, 2, 5 and columns at positions 4, 5
|
price
|
gender
|
date
|
|
|
2013-01-02
|
1200.0
|
female
|
2013-01-04
|
2133.0
|
male
|
2013-01-07
|
4432.0
|
female
|
df_inner.iloc[:3,:2]  # with iloc the numbers are zero-based positions, not index labels: first three rows, first two columns
|
id
|
city
|
date
|
|
|
2013-01-02
|
1001
|
BEIJING
|
2013-01-03
|
1002
|
SH
|
2013-01-04
|
1003
|
GUANGZHOU
|
# Mixed selection: rows by label (up to and including 2013-01-03), then the
# first four columns by position. .ix was deprecated in pandas 0.20 and removed
# in 1.0, so the label step uses .loc and the positional step uses .iloc.
df_inner.loc[:'2013-01-03'].iloc[:,:4]
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexingSee the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated"""Entry point for launching an IPython kernel.
|
id
|
city
|
category
|
age
|
date
|
|
|
|
|
2013-01-02
|
1001
|
BEIJING
|
100-A
|
23
|
2013-01-03
|
1002
|
SH
|
100-B
|
44
|
df_inner.loc[df_inner['city'].isin(['BEIJING','SHANGHAI'])]  # keep the rows whose city is BEIJING or SHANGHAI
|
id
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
date
|
|
|
|
|
|
|
|
|
|
2013-01-02
|
1001
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
NaN
|
2013-01-06
|
1005
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
NaN
|
2013-01-07
|
1006
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
1.0
|
pd.DataFrame(df_inner["category"].str[:3])  # first three characters of category, wrapped in a DataFrame
|
category
|
date
|
|
2013-01-02
|
100
|
2013-01-03
|
100
|
2013-01-04
|
110
|
2013-01-05
|
110
|
2013-01-06
|
210
|
2013-01-07
|
130
|
pd.DataFrame(df_inner["category"].str[-1])  # last character of category, wrapped in a DataFrame
|
category
|
date
|
|
2013-01-02
|
A
|
2013-01-03
|
B
|
2013-01-04
|
A
|
2013-01-05
|
C
|
2013-01-06
|
A
|
2013-01-07
|
F
|
数据筛选
使用与、或、非三个条件配合大于、小于、等于对数据进行筛选,并进行计数和求和
df_inner  # table used by the filtering examples (date is now the index)
|
id
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
date
|
|
|
|
|
|
|
|
|
|
2013-01-02
|
1001
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
NaN
|
2013-01-03
|
1002
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
NaN
|
2013-01-04
|
1003
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
NaN
|
2013-01-05
|
1004
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
NaN
|
2013-01-06
|
1005
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
NaN
|
2013-01-07
|
1006
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
1.0
|
df_inner.loc[(df_inner['age']>25)&(df_inner['city']=='BEIJING'),['id','city','age','price','gender','pay','sign']]  # AND filter: both conditions must hold
|
id
|
city
|
age
|
price
|
gender
|
pay
|
sign
|
date
|
|
|
|
|
|
|
|
2013-01-07
|
1006
|
BEIJING
|
32
|
4432.0
|
female
|
Y
|
1.0
|
df_inner.loc[(df_inner['age']>25)|(df_inner['city']=='BEIJING'),['id','city','age','price','gender','pay','sign']]  # OR filter: either condition may hold
|
id
|
city
|
age
|
price
|
gender
|
pay
|
sign
|
date
|
|
|
|
|
|
|
|
2013-01-02
|
1001
|
BEIJING
|
23
|
1200.0
|
female
|
N
|
NaN
|
2013-01-03
|
1002
|
SH
|
44
|
3299.5
|
male
|
Y
|
NaN
|
2013-01-04
|
1003
|
GUANGZHOU
|
54
|
2133.0
|
male
|
Y
|
NaN
|
2013-01-05
|
1004
|
SHENZHEN
|
32
|
5433.0
|
female
|
Y
|
NaN
|
2013-01-06
|
1005
|
SHANGHAI
|
34
|
3299.5
|
male
|
N
|
NaN
|
2013-01-07
|
1006
|
BEIJING
|
32
|
4432.0
|
female
|
Y
|
1.0
|
df_inner.loc[(df_inner['city']!='BEIJING'),['id','city','age','price','gender','pay','sign']]  # NOT filter: rows whose city is not BEIJING
|
id
|
city
|
age
|
price
|
gender
|
pay
|
sign
|
date
|
|
|
|
|
|
|
|
2013-01-03
|
1002
|
SH
|
44
|
3299.5
|
male
|
Y
|
NaN
|
2013-01-04
|
1003
|
GUANGZHOU
|
54
|
2133.0
|
male
|
Y
|
NaN
|
2013-01-05
|
1004
|
SHENZHEN
|
32
|
5433.0
|
female
|
Y
|
NaN
|
2013-01-06
|
1005
|
SHANGHAI
|
34
|
3299.5
|
male
|
N
|
NaN
|
df_inner.loc[(df_inner['city']!='BEIJING'),['id','city','age','price','gender','pay','sign']].city.count()  # count the city values in the filtered rows
4
df_inner.query('["BEIJING","SHANGHAI"]==city').price.sum()  # filter with query(), then sum the price column
8931.5
数据汇总
df_inner.groupby('city').count()  # per-city count of non-null values in every column
|
id
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
city
|
|
|
|
|
|
|
|
|
BEIJING
|
2
|
2
|
2
|
2
|
2
|
2
|
2
|
1
|
GUANGZHOU
|
1
|
1
|
1
|
1
|
1
|
1
|
1
|
0
|
SH
|
1
|
1
|
1
|
1
|
1
|
1
|
1
|
0
|
SHANGHAI
|
1
|
1
|
1
|
1
|
1
|
1
|
1
|
0
|
SHENZHEN
|
1
|
1
|
1
|
1
|
1
|
1
|
1
|
0
|
df_inner.groupby('city')['id'].count()  # count the id field per city
city
BEIJING 2
GUANGZHOU 1
SH 1
SHANGHAI 1
SHENZHEN 1
Name: id, dtype: int64
df_inner.groupby(['city','price'])['id'].count()  # count id grouped by two fields (city, price)
city price
BEIJING 1200.0 1
        4432.0 1
GUANGZHOU 2133.0 1
SH 3299.5 1
SHANGHAI 3299.5 1
SHENZHEN 5433.0 1
Name: id, dtype: int64
# Group by city and compute the row count (len), total and mean of price.
# 'sum' and 'mean' are passed by name: recent pandas versions (2.1+) emit a
# FutureWarning when NumPy callables are given to agg, and the named
# aggregations produce the same 'sum' and 'mean' result columns.
df_inner.groupby('city')['price'].agg([len,'sum','mean'])
|
len
|
sum
|
mean
|
city
|
|
|
|
BEIJING
|
2.0
|
5632.0
|
2816.0
|
GUANGZHOU
|
1.0
|
2133.0
|
2133.0
|
SH
|
1.0
|
3299.5
|
3299.5
|
SHANGHAI
|
1.0
|
3299.5
|
3299.5
|
SHENZHEN
|
1.0
|
5433.0
|
5433.0
|
df_inner.sample(n=2)  # draw a random sample of 2 rows
|
id
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
date
|
|
|
|
|
|
|
|
|
|
2013-01-05
|
1004
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
NaN
|
2013-01-07
|
1006
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
1.0
|
weights = [0, 0, 0, 0, 0.5, 0.5]  # one sampling weight per row, in row order
df_inner.sample(n=2, weights=weights)  # sample with manually specified weights
|
id
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
date
|
|
|
|
|
|
|
|
|
|
2013-01-07
|
1006
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
1.0
|
2013-01-06
|
1005
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
NaN
|
df_inner.sample(n=6, replace=False)  # sample without replacement
|
id
|
city
|
category
|
age
|
price
|
gender
|
pay
|
m-point
|
sign
|
date
|
|
|
|
|
|
|
|
|
|
2013-01-04
|
1003
|
GUANGZHOU
|
110-A
|
54
|
2133.0
|
male
|
Y
|
20
|
NaN
|
2013-01-03
|
1002
|
SH
|
100-B
|
44
|
3299.5
|
male
|
Y
|
10
|
NaN
|
2013-01-02
|
1001
|
BEIJING
|
100-A
|
23
|
1200.0
|
female
|
N
|
12
|
NaN
|
2013-01-05
|
1004
|
SHENZHEN
|
110-C
|
32
|
5433.0
|
female
|
Y
|
40
|
NaN
|
2013-01-07
|
1006
|
BEIJING
|
130-F
|
32
|
4432.0
|
female
|
Y
|
40
|
1.0
|
2013-01-06
|
1005
|
SHANGHAI
|
210-A
|
34
|
3299.5
|
male
|
N
|
40
|
NaN
|
df_inner.describe().round(2).T  # summary statistics; round() sets the number of decimals, T transposes the result
|
count
|
mean
|
std
|
min
|
25%
|
50%
|
75%
|
max
|
id
|
6.0
|
1003.5
|
1.87
|
1001.0
|
1002.25
|
1003.5
|
1004.75
|
1006.0
|
age
|
6.0
|
36.5
|
10.88
|
23.0
|
32.00
|
33.0
|
41.50
|
54.0
|
price
|
6.0
|
3299.5
|
1523.35
|
1200.0
|
2424.62
|
3299.5
|
4148.88
|
5433.0
|
m-point
|
6.0
|
27.0
|
14.63
|
10.0
|
14.00
|
30.0
|
40.00
|
40.0
|
sign
|
1.0
|
1.0
|
NaN
|
1.0
|
1.00
|
1.0
|
1.00
|
1.0
|
df_inner['price'].std()  # standard deviation of a column
1523.3516337339847
df_inner['price'].cov(df_inner['m-point'])  # covariance between two fields
16423.2
# Covariance between every pair of numeric columns. The text columns are
# dropped explicitly: since pandas 2.0, cov() no longer skips them silently
# and raises on non-numeric data instead.
df_inner.select_dtypes(include='number').cov()
|
id
|
age
|
price
|
m-point
|
sign
|
id
|
3.5
|
-0.7
|
1946.0
|
25.0
|
NaN
|
age
|
-0.7
|
118.3
|
-1353.5
|
-39.4
|
NaN
|
price
|
1946.0
|
-1353.5
|
2320600.2
|
16423.2
|
NaN
|
m-point
|
25.0
|
-39.4
|
16423.2
|
214.0
|
NaN
|
sign
|
NaN
|
NaN
|
NaN
|
NaN
|
NaN
|
# Correlation matrix of the numeric columns: values lie between -1 and 1;
# near 1 is a positive correlation, near -1 a negative one, 0 means uncorrelated.
# The text columns are dropped explicitly because corr() raises on non-numeric
# data since pandas 2.0.
df_inner.select_dtypes(include='number').corr()
|
id
|
age
|
price
|
m-point
|
sign
|
id
|
1.000000
|
-0.034401
|
0.682824
|
0.913480
|
NaN
|
age
|
-0.034401
|
1.000000
|
-0.081689
|
-0.247626
|
NaN
|
price
|
0.682824
|
-0.081689
|
1.000000
|
0.736972
|
NaN
|
m-point
|
0.913480
|
-0.247626
|
0.736972
|
1.000000
|
NaN
|
sign
|
NaN
|
NaN
|
NaN
|
NaN
|
NaN
|
df_inner['price'].corr(df_inner['m-point'])  # correlation between the price and m-point columns
0.7369715663870097
df_inner.to_csv('df_inner.csv')  # write the table to a CSV file
pandas用法总结相关推荐
- Python pandas用法
Python pandas用法 无味之味关注 12019.01.10 15:43:25字数 2,877阅读 91,914 介绍 在Python中,pandas是基于NumPy数组构建的,使数据预处理. ...
- 用python的pandas打开csv文件_python读写数据读写csv文件(pandas用法)
python中数据处理是比较方便的,经常用的就是读写文件,提取数据等,本博客主要介绍其中的一些用法.Pandas是一个强大的分析结构化数据的工具集;它的使用基础是Numpy(提供高性能的矩阵运算);用 ...
- pandas用法小结
前言 个人感觉网上对pandas的总结感觉不够详尽细致,在这里我对pandas做个相对细致的小结吧,在数据分析与人工智能方面会有所涉及到的东西在这里都说说吧,也是对自己学习的一种小结! pandas用 ...
- Pandas 用法总结
Pandas 用法总结 Pandas 简述: Pandas 是什么? Pandas是一个强大的分析结构化数据的工具集:它的使用基础是Numpy(提供高性能的矩阵运算):用于数据挖掘和数据分析,同时也提 ...
- python panda用法_Python3 pandas用法大全
Python3 pandas用法大全 一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: importnumpy as npimport pandas as p ...
- 【机器学习基础】前置知识(四):一文掌握Pandas用法
Pandas提供快速,灵活和富于表现力的数据结构,是强大的数据分析Python库. 本文收录于机器学习前置教程系列. 一.Series和DataFrame Pandas建立在NumPy之上,更多Num ...
- python panda用法_Python Pandas用法入门
简介 首先pandas是基于numpy进行开发的. Pandas 的主要数据结构是 Series(一维数据)与 DataFrame(二维数据),这两种数据结构足以处理金融.统计.社会科学.工程等领域里 ...
- python pandas 分类汇总用法_python之pandas用法大全
一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: import numpy as np import pandas as pd 2.导入CSV或者xlsx文件 ...
- Pandas教程(pandas用法)
Pandas 基本介绍 Numpy 和 Pandas 有什么不同 如果用 python 的列表和字典来作比较, 那么可以说 Numpy 是列表形式的,没有数值标签,而 Pandas 就是字典形式.Pa ...
- pandas用法详解
一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: import numpy as np import pandas as pd 2.导入CSV或者xlsx文件 ...
最新文章
- 综合素质计算机能力,教资统考中学《综合素质》:信息处理能力(一)
- 「 每日一练,快乐水题 」744. 寻找比目标字母大的最小字母
- c#实现Socket网络编程
- android AlertDialog.Builder
- PHP做二次开发:ThinkCMF门户应用安装
- python10086查询系统_Python获取移动性能指标
- php全选帖子删除,用PHP实现全选全删
- 网络连接数4000多正常吗_怀孕36周时胎儿发育情况是怎样的?胎儿体重有4斤多正常吗?...
- Redis Cluster部署、管理和测试
- linux 使cpu使用率升高_linux性能优化
- Candence PCB Allegro①贴片封装绘制
- 《壁纸 - 大全精选手机壁纸》EULA条款协议
- Civil 3d中的mms文件
- python anova_在python中对GLM进行Anova测试
- dell服务器服务器数据丢失后,数据恢复
- 测试方法——因果图法和判定表法
- OSGEarth解决南北极空洞问题
- GNU的C++代码书写规范
- Java限流策略与算法
- 0705第七讲标准模版库
热门文章
- Nacos中namespace,groupId,dataId使用
- 基于pytorch+transformers的车牌识别
- HTTP协议和RPC协议
- TP5框架集成极验验证码
- IT职业教育(1) 北大青鸟APTECH 1
- Xposed模块开发教程
- Direct3D 10转型时代所面临的空前窘境
- 倍福控制器连接松下EtherCAT伺服注意事项
- jdk1.8 Switch 不能使用 String类型
- OpenProcessToken LookupPrivilegeValue 和AdjustTokenPrivilege