pandas用法总结

import numpy as np
import pandas as pd

# Build the sample table. Some city values deliberately carry stray whitespace and
# mixed case, and two prices are missing, so the cleaning steps have work to do.
df = pd.DataFrame(
    data={
        'id': [1001, 1002, 1003, 1004, 1005, 1006],
        'date': pd.date_range(start='20130102', periods=6),
        'city': ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        'age': [23, 44, 54, 32, 34, 32],
        'category': ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        'price': [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    # Explicit column order for the resulting frame.
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)
df

	id	date	city	category	age	price
0	1001	2013-01-02	Beijing	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	NaN
2	1003	2013-01-04	guangzhou	110-A	54	2133.0
3	1004	2013-01-05	Shenzhen	110-C	32	5433.0
4	1005	2013-01-06	shanghai	210-A	34	NaN
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

df.shape#维度

(6, 6)

df.info()#维度、列名称、数据格式、所占空间等

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes

df.dtypes#每一列数据的格式

id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object

df['price'].dtype#格式

dtype('float64')

df['price'].isnull()#看空值

0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool

df['price'].unique()#查看某一列的唯一值
df['city'].unique()

array(['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai','BEIJING '], dtype=object)

df['price'].value_counts()#计数值value_counts()

4432.0    1
5433.0    1
2133.0    1
1200.0    1
Name: price, dtype: int64

df.values#转numpy的ndarray格式

array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,1200.0],[1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],[1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A',54, 2133.0],[1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,5433.0],[1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,nan],[1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,4432.0]], dtype=object)

df.columns#看列名称

Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')

df.head()#前5行

	id	date	city	category	age	price
0	1001	2013-01-02	Beijing	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	NaN
2	1003	2013-01-04	guangzhou	110-A	54	2133.0
3	1004	2013-01-05	Shenzhen	110-C	32	5433.0
4	1005	2013-01-06	shanghai	210-A	34	NaN

df.tail()#后五行

	id	date	city	category	age	price
1	1002	2013-01-03	SH	100-B	44	NaN
2	1003	2013-01-04	guangzhou	110-A	54	2133.0
3	1004	2013-01-05	Shenzhen	110-C	32	5433.0
4	1005	2013-01-06	shanghai	210-A	34	NaN
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

数据表清洗

df.fillna(value=0)#0填充

	id	date	city	category	age	price
0	1001	2013-01-02	Beijing	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	0.0
2	1003	2013-01-04	guangzhou	110-A	54	2133.0
3	1004	2013-01-05	Shenzhen	110-C	32	5433.0
4	1005	2013-01-06	shanghai	210-A	34	0.0
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

df['price']=df['price'].fillna(df['price'].mean())#均值填充

df

	id	date	city	category	age	price
0	1001	2013-01-02	Beijing	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	3299.5
2	1003	2013-01-04	guangzhou	110-A	54	2133.0
3	1004	2013-01-05	Shenzhen	110-C	32	5433.0
4	1005	2013-01-06	shanghai	210-A	34	3299.5
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

print(df['city'])
# Strip leading/trailing whitespace from the city column (e.g. the trailing space in
# 'Beijing ' and the spaces around ' guangzhou '). The vectorised .str accessor passes
# missing values through, whereas map(str.strip) raises TypeError on a NaN cell.
df['city']=df['city'].str.strip()
print(df['city'])

0       Beijing
1             SH
2     guangzhou
3       Shenzhen
4       shanghai
5       BEIJING
Name: city, dtype: object
0      Beijing
1           SH
2    guangzhou
3     Shenzhen
4     shanghai
5      BEIJING
Name: city, dtype: object

df['city']=df['city'].str.upper()#转成全大写,lower小写
df

	id	date	city	category	age	price
0	1001	2013-01-02	BEIJING	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	3299.5
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

df['price'].astype('int')#更改数据格式

0    1200
1    3299
2    2133
3    5433
4    3299
5    4432
Name: price, dtype: int32

df.rename(columns={'category': 'category-size'}) #更改列名称

	id	date	city	category-size	age	price
0	1001	2013-01-02	BEIJING	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	3299.5
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

# df['city'].drop_duplicates()#删除后出现的重复值
# df

# Value replacement. With scalar arguments Series.replace matches whole cell values
# exactly (case-sensitive). The city column was upper-cased earlier, so 'sh' matches
# nothing here and the returned Series is unchanged; replace('SH','SHANGHAI') would be
# needed to rewrite the 'SH' row. The result is not assigned back, so df is untouched.
df['city'].replace('sh','shanghai')

0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

数据预处理

# Second sample table: shares the 'id' key with df and adds ids 1007/1008 that df lacks.
df1 = pd.DataFrame(
    data={
        'id': [1002, 1001, 1003, 1004, 1005, 1006, 1007, 1008],
        'gender': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'pay': ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y'],
        'm-point': [10, 12, 20, 40, 40, 40, 30, 20],
    }
)

df1

	id	gender	pay	m-point
0	1002	male	Y	10
1	1001	female	N	12
2	1003	male	Y	20
3	1004	female	Y	40
4	1005	male	N	40
5	1006	female	Y	40
6	1007	male	N	30
7	1008	female	Y	20

merge

# 匹配合并，交集,以id为标识
df_inner=pd.merge(df,df1,how='inner',on='id')
df_inner.head(10)

	id	date	city	category	age	price	gender	pay	m-point
0	1001	2013-01-02	BEIJING	100-A	23	1200.0	female	N	12
1	1002	2013-01-03	SH	100-B	44	3299.5	male	Y	10
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	male	Y	20
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0	female	Y	40
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5	male	N	40
5	1006	2013-01-07	BEIJING	130-F	32	4432.0	female	Y	40

# 匹配合并，左边为重
df_left=pd.merge(df,df1,how='left')
df_left

	id	date	city	category	age	price	gender	pay	m-point
0	1001	2013-01-02	BEIJING	100-A	23	1200.0	female	N	12
1	1002	2013-01-03	SH	100-B	44	3299.5	male	Y	10
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	male	Y	20
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0	female	Y	40
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5	male	N	40
5	1006	2013-01-07	BEIJING	130-F	32	4432.0	female	Y	40

# 匹配合并，右边为基准
df_right=pd.merge(df,df1,how='right')
df_right

	id	date	city	category	age	price	gender	pay	m-point
0	1001	2013-01-02	BEIJING	100-A	23.0	1200.0	female	N	12
1	1002	2013-01-03	SH	100-B	44.0	3299.5	male	Y	10
2	1003	2013-01-04	GUANGZHOU	110-A	54.0	2133.0	male	Y	20
3	1004	2013-01-05	SHENZHEN	110-C	32.0	5433.0	female	Y	40
4	1005	2013-01-06	SHANGHAI	210-A	34.0	3299.5	male	N	40
5	1006	2013-01-07	BEIJING	130-F	32.0	4432.0	female	Y	40
6	1007	NaT	NaN	NaN	NaN	NaN	male	N	30
7	1008	NaT	NaN	NaN	NaN	NaN	female	Y	20

# 匹配合并，并集
df_outer=pd.merge(df,df1,how='outer')
df_outer

	id	date	city	category	age	price	gender	pay	m-point
0	1001	2013-01-02	BEIJING	100-A	23.0	1200.0	female	N	12
1	1002	2013-01-03	SH	100-B	44.0	3299.5	male	Y	10
2	1003	2013-01-04	GUANGZHOU	110-A	54.0	2133.0	male	Y	20
3	1004	2013-01-05	SHENZHEN	110-C	32.0	5433.0	female	Y	40
4	1005	2013-01-06	SHANGHAI	210-A	34.0	3299.5	male	N	40
5	1006	2013-01-07	BEIJING	130-F	32.0	4432.0	female	Y	40
6	1007	NaT	NaN	NaN	NaN	NaN	male	N	30
7	1008	NaT	NaN	NaN	NaN	NaN	female	Y	20

append

# Stack df underneath df1 (row-wise concatenation; original row labels are kept).
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# sort=True orders the non-aligned columns alphabetically, which is what the old
# append default did.
result = pd.concat([df1, df], sort=True)
result

G:\Anaconda\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'.sort=sort)

	age	category	city	date	gender	id	m-point	pay	price
0	NaN	NaN	NaN	NaT	male	1002	10.0	Y	NaN
1	NaN	NaN	NaN	NaT	female	1001	12.0	N	NaN
2	NaN	NaN	NaN	NaT	male	1003	20.0	Y	NaN
3	NaN	NaN	NaN	NaT	female	1004	40.0	Y	NaN
4	NaN	NaN	NaN	NaT	male	1005	40.0	N	NaN
5	NaN	NaN	NaN	NaT	female	1006	40.0	Y	NaN
6	NaN	NaN	NaN	NaT	male	1007	30.0	N	NaN
7	NaN	NaN	NaN	NaT	female	1008	20.0	Y	NaN
0	23.0	100-A	BEIJING	2013-01-02	NaN	1001	NaN	NaN	1200.0
1	44.0	100-B	SH	2013-01-03	NaN	1002	NaN	NaN	3299.5
2	54.0	110-A	GUANGZHOU	2013-01-04	NaN	1003	NaN	NaN	2133.0
3	32.0	110-C	SHENZHEN	2013-01-05	NaN	1004	NaN	NaN	5433.0
4	34.0	210-A	SHANGHAI	2013-01-06	NaN	1005	NaN	NaN	3299.5
5	32.0	130-F	BEIJING	2013-01-07	NaN	1006	NaN	NaN	4432.0

join

left=pd.DataFrame({'a':[1,2,3],'b':[4,6,7]})
left

	a	b
0	1	4
1	2	6
2	3	7

right=pd.DataFrame({'c':[1,1,1],'d':[2,2,2]})
right

	c	d
0	1	2
1	1	2
2	1	2

result=left.join(right,how='inner')
result

	a	b	c	d
0	1	4	1	2
1	2	6	1	2
2	3	7	1	2

concat

frames=[left,right,result]
result1=pd.concat(frames)
result1

G:\Anaconda\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'.

	a	b	c	d
0	1.0	4.0	NaN	NaN
1	2.0	6.0	NaN	NaN
2	3.0	7.0	NaN	NaN
0	NaN	NaN	1.0	2.0
1	NaN	NaN	1.0	2.0
2	NaN	NaN	1.0	2.0
0	1.0	4.0	1.0	2.0
1	2.0	6.0	1.0	2.0
2	3.0	7.0	1.0	2.0

df

	id	date	city	category	age	price
0	1001	2013-01-02	BEIJING	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	3299.5
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

df_inner.set_index('id')#设置id为索引列

	date	city	category	age	price	gender	pay	m-point
id
1001	2013-01-02	BEIJING	100-A	23	1200.0	female	N	12
1002	2013-01-03	SH	100-B	44	3299.5	male	Y	10
1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	male	Y	20
1004	2013-01-05	SHENZHEN	110-C	32	5433.0	female	Y	40
1005	2013-01-06	SHANGHAI	210-A	34	3299.5	male	N	40
1006	2013-01-07	BEIJING	130-F	32	4432.0	female	Y	40

df.sort_values(by=['age'])#按照特定列的值排序,age

	id	date	city	category	age	price
0	1001	2013-01-02	BEIJING	100-A	23	1200.0
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0
5	1006	2013-01-07	BEIJING	130-F	32	4432.0
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5
1	1002	2013-01-03	SH	100-B	44	3299.5
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0

df.sort_index()#按照索引列排序

	id	date	city	category	age	price
0	1001	2013-01-02	BEIJING	100-A	23	1200.0
1	1002	2013-01-03	SH	100-B	44	3299.5
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5
5	1006	2013-01-07	BEIJING	130-F	32	4432.0

df['group']=np.where(df['price']>3000,'high','low')
df
# If the value in the price column is > 3000 the group column shows 'high', otherwise 'low'.

	id	date	city	category	age	price	group
0	1001	2013-01-02	BEIJING	100-A	23	1200.0	low
1	1002	2013-01-03	SH	100-B	44	3299.5	high
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	low
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0	high
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5	high
5	1006	2013-01-07	BEIJING	130-F	32	4432.0	high

df_inner

	id	date	city	category	age	price	gender	pay	m-point
0	1001	2013-01-02	BEIJING	100-A	23	1200.0	female	N	12
1	1002	2013-01-03	SH	100-B	44	3299.5	male	Y	10
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	male	Y	20
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0	female	Y	40
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5	male	N	40
5	1006	2013-01-07	BEIJING	130-F	32	4432.0	female	Y	40

#对复合多个条件的数据进行分组标记
df_inner.loc[(df_inner['city']=='BEIJING')&(df_inner['price']>=4000),'sign']=1
df_inner

	id	date	city	category	age	price	gender	pay	m-point	sign
0	1001	2013-01-02	BEIJING	100-A	23	1200.0	female	N	12	NaN
1	1002	2013-01-03	SH	100-B	44	3299.5	male	Y	10	NaN
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	male	Y	20	NaN
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0	female	Y	40	NaN
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5	male	N	40	NaN
5	1006	2013-01-07	BEIJING	130-F	32	4432.0	female	Y	40	1.0

# pd.DataFrame((x.split('-')for x in df_inner['category']),index=df_inner.index,columns=['category','size'])

数据提取：loc、iloc 和 ix。loc 函数按标签值进行提取，iloc 按位置进行提取，ix 可以同时按标签和位置进行提取（注意：ix 已被弃用，应改用 loc 或 iloc）。

df_inner.loc[3]#按索引提取单行的数值

id                         1004
date        2013-01-05 00:00:00
city                   SHENZHEN
category                  110-C
age                          32
price                      5433
gender                   female
pay                           Y
m-point                      40
sign                        NaN
Name: 3, dtype: object

df_inner.iloc[0:5]#按索引提取区域行数值

	id	date	city	category	age	price	gender	pay	m-point	sign
0	1001	2013-01-02	BEIJING	100-A	23	1200.0	female	N	12	NaN
1	1002	2013-01-03	SH	100-B	44	3299.5	male	Y	10	NaN
2	1003	2013-01-04	GUANGZHOU	110-A	54	2133.0	male	Y	20	NaN
3	1004	2013-01-05	SHENZHEN	110-C	32	5433.0	female	Y	40	NaN
4	1005	2013-01-06	SHANGHAI	210-A	34	3299.5	male	N	40	NaN

df_inner=df_inner.set_index('date') #设置日期为索引
df_inner.iloc[[0,2,5],[4,5]]#提取第0、2、5行，4、5列

	price	gender
date
2013-01-02	1200.0	female
2013-01-04	2133.0	male
2013-01-07	4432.0	female

df_inner.iloc[:3,:2]#冒号前后的数字不再是索引的标签名称，而是数据所在的位置，从0开始，前三行，前两列

	id	city
date
2013-01-02	1001	BEIJING
2013-01-03	1002	SH
2013-01-04	1003	GUANGZHOU

# Rows up to and including the label '2013-01-03', first four columns.
# The mixed label/position indexer .ix was removed in pandas 1.0: select the rows by
# label with .loc (the end label is inclusive), then the columns by position with .iloc.
df_inner.loc[:'2013-01-03'].iloc[:, :4]

G:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexingSee the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated"""Entry point for launching an IPython kernel.

	id	city	category	age
date
2013-01-02	1001	BEIJING	100-A	23
2013-01-03	1002	SH	100-B	44

df_inner.loc[df_inner['city'].isin(['BEIJING','SHANGHAI'])]#判断city列里是否包含beijing和shanghai，然后将符合条件的数据提取出来

	id	city	category	age	price	gender	pay	m-point	sign
date
2013-01-02	1001	BEIJING	100-A	23	1200.0	female	N	12	NaN
2013-01-06	1005	SHANGHAI	210-A	34	3299.5	male	N	40	NaN
2013-01-07	1006	BEIJING	130-F	32	4432.0	female	Y	40	1.0

pd.DataFrame(df_inner["category"].str[:3])#提取前三个字符，并生成数据表

	category
date
2013-01-02	100
2013-01-03	100
2013-01-04	110
2013-01-05	110
2013-01-06	210
2013-01-07	130

pd.DataFrame(df_inner["category"].str[-1])#提取最后一个字符，并生成数据表

	category
date
2013-01-02	A
2013-01-03	B
2013-01-04	A
2013-01-05	C
2013-01-06	A
2013-01-07	F

数据筛选

使用与、或、非三个条件配合大于、小于、等于对数据进行筛选，并进行计数和求和

df_inner

	id	city	category	age	price	gender	pay	m-point	sign
date
2013-01-02	1001	BEIJING	100-A	23	1200.0	female	N	12	NaN
2013-01-03	1002	SH	100-B	44	3299.5	male	Y	10	NaN
2013-01-04	1003	GUANGZHOU	110-A	54	2133.0	male	Y	20	NaN
2013-01-05	1004	SHENZHEN	110-C	32	5433.0	female	Y	40	NaN
2013-01-06	1005	SHANGHAI	210-A	34	3299.5	male	N	40	NaN
2013-01-07	1006	BEIJING	130-F	32	4432.0	female	Y	40	1.0

df_inner.loc[(df_inner['age']>25)&(df_inner['city']=='BEIJING'),['id','city','age','price','gender','pay','sign']]#与筛选

	id	city	age	price	gender	pay	sign
date
2013-01-07	1006	BEIJING	32	4432.0	female	Y	1.0

df_inner.loc[(df_inner['age']>25)|(df_inner['city']=='BEIJING'),['id','city','age','price','gender','pay','sign']]#或筛选

	id	city	age	price	gender	pay	sign
date
2013-01-02	1001	BEIJING	23	1200.0	female	N	NaN
2013-01-03	1002	SH	44	3299.5	male	Y	NaN
2013-01-04	1003	GUANGZHOU	54	2133.0	male	Y	NaN
2013-01-05	1004	SHENZHEN	32	5433.0	female	Y	NaN
2013-01-06	1005	SHANGHAI	34	3299.5	male	N	NaN
2013-01-07	1006	BEIJING	32	4432.0	female	Y	1.0

df_inner.loc[(df_inner['city']!='BEIJING'),['id','city','age','price','gender','pay','sign']]#非筛选

	id	city	age	price	gender	pay	sign
date
2013-01-03	1002	SH	44	3299.5	male	Y	NaN
2013-01-04	1003	GUANGZHOU	54	2133.0	male	Y	NaN
2013-01-05	1004	SHENZHEN	32	5433.0	female	Y	NaN
2013-01-06	1005	SHANGHAI	34	3299.5	male	N	NaN

df_inner.loc[(df_inner['city']!='BEIJING'),['id','city','age','price','gender','pay','sign']].city.count()#筛选后的数据按照city数计数

df_inner.query('["BEIJING","SHANGHAI"]==city').price.sum()#使用query函数进行筛选,再求和

8931.5

数据汇总

df_inner.groupby('city').count()#对所有的列进行计数汇总

	id	category	age	price	gender	pay	m-point	sign
city
BEIJING	2	2	2	2	2	2	2	1
GUANGZHOU	1	1	1	1	1	1	1	0
SH	1	1	1	1	1	1	1	0
SHANGHAI	1	1	1	1	1	1	1	0
SHENZHEN	1	1	1	1	1	1	1	0

df_inner.groupby('city')['id'].count()#按城市对id字段进行计数

city
BEIJING      2
GUANGZHOU    1
SH           1
SHANGHAI     1
SHENZHEN     1
Name: id, dtype: int64

df_inner.groupby(['city','price'])['id'].count()#对两个字段进行汇总计数

city       price
BEIJING    1200.0    1
           4432.0    1
GUANGZHOU  2133.0    1
SH         3299.5    1
SHANGHAI   3299.5    1
SHENZHEN   5433.0    1
Name: id, dtype: int64

# Group by city and compute the row count, total and mean of price for each group.
# 'sum'/'mean' are passed by name: handing np.sum/np.mean to agg is deprecated in
# recent pandas, and the string names produce the same 'sum'/'mean' column labels.
df_inner.groupby('city')['price'].agg([len,'sum','mean'])

	len	sum	mean
city
BEIJING	2.0	5632.0	2816.0
GUANGZHOU	1.0	2133.0	2133.0
SH	1.0	3299.5	3299.5
SHANGHAI	1.0	3299.5	3299.5
SHENZHEN	1.0	5433.0	5433.0

df_inner.sample(n=2)#采样

	id	city	category	age	price	gender	pay	m-point	sign
date
2013-01-05	1004	SHENZHEN	110-C	32	5433.0	female	Y	40	NaN
2013-01-07	1006	BEIJING	130-F	32	4432.0	female	Y	40	1.0

weights = [0, 0, 0, 0, 0.5, 0.5]
df_inner.sample(n=2, weights=weights)#手动设置采样权重

	id	city	category	age	price	gender	pay	m-point	sign
date
2013-01-07	1006	BEIJING	130-F	32	4432.0	female	Y	40	1.0
2013-01-06	1005	SHANGHAI	210-A	34	3299.5	male	N	40	NaN

df_inner.sample(n=6, replace=False) #采样不放回

	id	city	category	age	price	gender	pay	m-point	sign
date
2013-01-04	1003	GUANGZHOU	110-A	54	2133.0	male	Y	20	NaN
2013-01-03	1002	SH	100-B	44	3299.5	male	Y	10	NaN
2013-01-02	1001	BEIJING	100-A	23	1200.0	female	N	12	NaN
2013-01-05	1004	SHENZHEN	110-C	32	5433.0	female	Y	40	NaN
2013-01-07	1006	BEIJING	130-F	32	4432.0	female	Y	40	1.0
2013-01-06	1005	SHANGHAI	210-A	34	3299.5	male	N	40	NaN

df_inner.describe().round(2).T #round函数设置显示小数位，T表示转置

	count	mean	std	min	25%	50%	75%	max
id	6.0	1003.5	1.87	1001.0	1002.25	1003.5	1004.75	1006.0
age	6.0	36.5	10.88	23.0	32.00	33.0	41.50	54.0
price	6.0	3299.5	1523.35	1200.0	2424.62	3299.5	4148.88	5433.0
m-point	6.0	27.0	14.63	10.0	14.00	30.0	40.00	40.0
sign	1.0	1.0	NaN	1.0	1.00	1.0	1.00	1.0

df_inner['price'].std()#计算列的标准差

1523.3516337339847

df_inner['price'].cov(df_inner['m-point'])#计算两个字段间的协方差

16423.2

# Covariance between all numeric columns of the table. The string columns (city,
# category, gender, pay) are dropped first: since pandas 2.0 DataFrame.cov no longer
# skips non-numeric columns silently and raises on them instead.
df_inner.select_dtypes(include='number').cov()

	id	age	price	m-point	sign
id	3.5	-0.7	1946.0	25.0	NaN
age	-0.7	118.3	-1353.5	-39.4	NaN
price	1946.0	-1353.5	2320600.2	16423.2	NaN
m-point	25.0	-39.4	16423.2	214.0	NaN
sign	NaN	NaN	NaN	NaN	NaN

# Pairwise correlation coefficients of the numeric columns. Values lie between -1 and 1:
# close to 1 means positively correlated, close to -1 negatively correlated, 0 uncorrelated.
# Non-numeric columns are dropped first because DataFrame.corr raises on them in pandas >= 2.0.
df_inner.select_dtypes(include='number').corr()

	id	age	price	m-point	sign
id	1.000000	-0.034401	0.682824	0.913480	NaN
age	-0.034401	1.000000	-0.081689	-0.247626	NaN
price	0.682824	-0.081689	1.000000	0.736972	NaN
m-point	0.913480	-0.247626	0.736972	1.000000	NaN
sign	NaN	NaN	NaN	NaN	NaN

df_inner['price'].corr(df_inner['m-point'])# correlation coefficient between the price and m-point columns

0.7369715663870097

df_inner.to_csv('df_inner.csv')#数据写入到csv

pandas用法总结相关推荐

Python pandas用法
Python pandas用法无味之味关注 12019.01.10 15:43:25字数 2,877阅读 91,914 介绍在Python中,pandas是基于NumPy数组构建的,使数据预处理. ...
用python的pandas打开csv文件_python读写数据读写csv文件(pandas用法)
python中数据处理是比较方便的,经常用的就是读写文件,提取数据等,本博客主要介绍其中的一些用法.Pandas是一个强大的分析结构化数据的工具集;它的使用基础是Numpy(提供高性能的矩阵运算);用 ...
pandas用法小结
前言个人感觉网上对pandas的总结感觉不够详尽细致,在这里我对pandas做个相对细致的小结吧,在数据分析与人工智能方面会有所涉及到的东西在这里都说说吧,也是对自己学习的一种小结! pandas用 ...
Pandas 用法总结
Pandas 用法总结 Pandas 简述: Pandas 是什么? Pandas是一个强大的分析结构化数据的工具集:它的使用基础是Numpy(提供高性能的矩阵运算):用于数据挖掘和数据分析,同时也提 ...
python panda用法_Python3 pandas用法大全
Python3 pandas用法大全一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: importnumpy as npimport pandas as p ...
【机器学习基础】前置知识（四）：一文掌握Pandas用法
Pandas提供快速,灵活和富于表现力的数据结构,是强大的数据分析Python库. 本文收录于机器学习前置教程系列. 一.Series和DataFrame Pandas建立在NumPy之上,更多Num ...
python panda用法_Python Pandas用法入门
简介首先pandas是基于numpy进行开发的. Pandas 的主要数据结构是 Series(一维数据)与 DataFrame(二维数据),这两种数据结构足以处理金融.统计.社会科学.工程等领域里 ...
python pandas 分类汇总用法_python之pandas用法大全
一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: import numpy as np import pandas as pd 2.导入CSV或者xlsx文件 ...
Pandas教程（pandas用法）
Pandas 基本介绍 Numpy 和 Pandas 有什么不同如果用 python 的列表和字典来作比较, 那么可以说 Numpy 是列表形式的,没有数值标签,而 Pandas 就是字典形式.Pa ...
pandas用法详解
一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: import numpy as np import pandas as pd 2.导入CSV或者xlsx文件 ...

pandas用法总结

pandas用法总结相关推荐

最新文章

热门文章

	a	b	c	d
0	1.0	4.0	NaN	NaN
1	2.0	6.0	NaN	NaN
2	3.0	7.0	NaN	NaN
0	NaN	NaN	1.0	2.0
1	NaN	NaN	1.0	2.0
2	NaN	NaN	1.0	2.0
0	1.0	4.0	1.0	2.0
1	2.0	6.0	1.0	2.0
2	3.0	7.0	1.0	2.0

	a	b	c	d
0	1.0	4.0	NaN	NaN
1	2.0	6.0	NaN	NaN
2	3.0	7.0	NaN	NaN
0	NaN	NaN	1.0	2.0
1	NaN	NaN	1.0	2.0
2	NaN	NaN	1.0	2.0
0	1.0	4.0	1.0	2.0
1	2.0	6.0	1.0	2.0
2	3.0	7.0	1.0	2.0

	a	b	c	d
0	1.0	4.0	NaN	NaN
1	2.0	6.0	NaN	NaN
2	3.0	7.0	NaN	NaN
0	NaN	NaN	1.0	2.0
1	NaN	NaN	1.0	2.0
2	NaN	NaN	1.0	2.0
0	1.0	4.0	1.0	2.0
1	2.0	6.0	1.0	2.0
2	3.0	7.0	1.0	2.0