import numpy as np
import pandas as pd
# Build the sample data table used throughout the walkthrough.
df = pd.DataFrame(
    {
        "id": [1001, 1002, 1003, 1004, 1005, 1006],
        "date": pd.date_range('20130102', periods=6),
        "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "age": [23, 44, 54, 32, 34, 32],
        # Two prices are deliberately missing so the cleaning steps have NaNs to fill.
        "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)
df
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
df.shape#维度
(6, 6)
df.info()#维度、列名称、数据格式、所占空间等
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes
df.dtypes#每一列数据的格式
id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object
df['price'].dtype#格式
dtype('float64')
df['price'].isnull()#看空值
0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool
df['price'].unique()#查看某一列的唯一值
df['city'].unique()
array(['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai','BEIJING '], dtype=object)
df['price'].value_counts()#计数值value_counts()
4432.0    1
5433.0    1
2133.0    1
1200.0    1
Name: price, dtype: int64
df.values#转numpy的ndarray格式
array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,1200.0],[1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],[1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A',54, 2133.0],[1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,5433.0],[1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,nan],[1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,4432.0]], dtype=object)
df.columns#看列名称
Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
df.head()#前5行
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
df.tail()#后五行
id date city category age price
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0

数据表清洗

df.fillna(value=0)#0填充
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 0.0
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 0.0
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
df['price']=df['price'].fillna(df['price'].mean())#均值填充
df
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 3299.5
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
print(df['city'])
df['city']=df['city'].map(str.strip)#清除city字段的字符空格,看Beijing 后边和guangzhou前边
print(df['city'])
0       Beijing
1             SH
2     guangzhou
3       Shenzhen
4       shanghai
5       BEIJING
Name: city, dtype: object
0      Beijing
1           SH
2    guangzhou
3     Shenzhen
4     shanghai
5      BEIJING
Name: city, dtype: object
df['city']=df['city'].str.upper()#转成全大写,lower小写
df
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
df['price'].astype('int')#更改数据格式
0    1200
1    3299
2    2133
3    5433
4    3299
5    4432
Name: price, dtype: int32
df.rename(columns={'category': 'category-size'}) #更改列名称
id date city category-size age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
# df['city'].drop_duplicates()#删除后出现的重复值
# df
df['city'].replace('sh','shanghai')  # Value replacement. NOTE(review): city was upper-cased above, so 'sh' matches nothing and the output below is unchanged; replace('SH','SHANGHAI') is presumably what was intended.
0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

数据预处理

# Second sample table; it shares the "id" column with the first table.
df1 = pd.DataFrame(
    {
        "id": [1002, 1001, 1003, 1004, 1005, 1006, 1007, 1008],
        "gender": ['male', 'female'] * 4,
        "pay": ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y'],
        "m-point": [10, 12, 20, 40, 40, 40, 30, 20],
    }
)
df1
id gender pay m-point
0 1002 male Y 10
1 1001 female N 12
2 1003 male Y 20
3 1004 female Y 40
4 1005 male N 40
5 1006 female Y 40
6 1007 male N 30
7 1008 female Y 20

merge

# 匹配合并,交集,以id为标识
df_inner=pd.merge(df,df1,how='inner',on='id')
df_inner.head(10)
id date city category age price gender pay m-point
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 female N 12
1 1002 2013-01-03 SH 100-B 44 3299.5 male Y 10
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5 male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
# 匹配合并,左边为重
df_left=pd.merge(df,df1,how='left')
df_left
id date city category age price gender pay m-point
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 female N 12
1 1002 2013-01-03 SH 100-B 44 3299.5 male Y 10
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5 male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
# 匹配合并,右边为基准
df_right=pd.merge(df,df1,how='right')
df_right
id date city category age price gender pay m-point
0 1001 2013-01-02 BEIJING 100-A 23.0 1200.0 female N 12
1 1002 2013-01-03 SH 100-B 44.0 3299.5 male Y 10
2 1003 2013-01-04 GUANGZHOU 110-A 54.0 2133.0 male Y 20
3 1004 2013-01-05 SHENZHEN 110-C 32.0 5433.0 female Y 40
4 1005 2013-01-06 SHANGHAI 210-A 34.0 3299.5 male N 40
5 1006 2013-01-07 BEIJING 130-F 32.0 4432.0 female Y 40
6 1007 NaT NaN NaN NaN NaN male N 30
7 1008 NaT NaN NaN NaN NaN female Y 20
# 匹配合并,并集
df_outer=pd.merge(df,df1,how='outer')
df_outer
id date city category age price gender pay m-point
0 1001 2013-01-02 BEIJING 100-A 23.0 1200.0 female N 12
1 1002 2013-01-03 SH 100-B 44.0 3299.5 male Y 10
2 1003 2013-01-04 GUANGZHOU 110-A 54.0 2133.0 male Y 20
3 1004 2013-01-05 SHENZHEN 110-C 32.0 5433.0 female Y 40
4 1005 2013-01-06 SHANGHAI 210-A 34.0 3299.5 male N 40
5 1006 2013-01-07 BEIJING 130-F 32.0 4432.0 female Y 40
6 1007 NaT NaN NaN NaN NaN male N 30
7 1008 NaT NaN NaN NaN NaN female Y 20

append

result = df1.append(df)  # Stack the rows of df below df1. NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat([df1, df]) is the replacement.
result
G:\Anaconda\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort)
age category city date gender id m-point pay price
0 NaN NaN NaN NaT male 1002 10.0 Y NaN
1 NaN NaN NaN NaT female 1001 12.0 N NaN
2 NaN NaN NaN NaT male 1003 20.0 Y NaN
3 NaN NaN NaN NaT female 1004 40.0 Y NaN
4 NaN NaN NaN NaT male 1005 40.0 N NaN
5 NaN NaN NaN NaT female 1006 40.0 Y NaN
6 NaN NaN NaN NaT male 1007 30.0 N NaN
7 NaN NaN NaN NaT female 1008 20.0 Y NaN
0 23.0 100-A BEIJING 2013-01-02 NaN 1001 NaN NaN 1200.0
1 44.0 100-B SH 2013-01-03 NaN 1002 NaN NaN 3299.5
2 54.0 110-A GUANGZHOU 2013-01-04 NaN 1003 NaN NaN 2133.0
3 32.0 110-C SHENZHEN 2013-01-05 NaN 1004 NaN NaN 5433.0
4 34.0 210-A SHANGHAI 2013-01-06 NaN 1005 NaN NaN 3299.5
5 32.0 130-F BEIJING 2013-01-07 NaN 1006 NaN NaN 4432.0

join

left=pd.DataFrame({'a':[1,2,3],'b':[4,6,7]})
left
a b
0 1 4
1 2 6
2 3 7
right=pd.DataFrame({'c':[1,1,1],'d':[2,2,2]})
right
c d
0 1 2
1 1 2
2 1 2
result=left.join(right,how='inner')
result
a b c d
0 1 4 1 2
1 2 6 1 2
2 3 7 1 2

concat

frames=[left,right,result]
result1=pd.concat(frames)
result1
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.
a b c d
0 1.0 4.0 NaN NaN
1 2.0 6.0 NaN NaN
2 3.0 7.0 NaN NaN
0 NaN NaN 1.0 2.0
1 NaN NaN 1.0 2.0
2 NaN NaN 1.0 2.0
0 1.0 4.0 1.0 2.0
1 2.0 6.0 1.0 2.0
2 3.0 7.0 1.0 2.0
df
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
df_inner.set_index('id')#设置id为索引列
date city category age price gender pay m-point
id
1001 2013-01-02 BEIJING 100-A 23 1200.0 female N 12
1002 2013-01-03 SH 100-B 44 3299.5 male Y 10
1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male Y 20
1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female Y 40
1005 2013-01-06 SHANGHAI 210-A 34 3299.5 male N 40
1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
df.sort_values(by=['age'])#按照特定列的值排序,age
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
df.sort_index()#按照索引列排序
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
df['group']=np.where(df['price']>3000,'high','low')
df
#如果price列的值>3000,group列显示high,否则显示low
id date city category age price group
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 low
1 1002 2013-01-03 SH 100-B 44 3299.5 high
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 low
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 high
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5 high
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 high
df_inner
id date city category age price gender pay m-point
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 female N 12
1 1002 2013-01-03 SH 100-B 44 3299.5 male Y 10
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5 male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
#对复合多个条件的数据进行分组标记
df_inner.loc[(df_inner['city']=='BEIJING')&(df_inner['price']>=4000),'sign']=1
df_inner
id date city category age price gender pay m-point sign
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 female N 12 NaN
1 1002 2013-01-03 SH 100-B 44 3299.5 male Y 10 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male Y 20 NaN
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female Y 40 NaN
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5 male N 40 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40 1.0
# pd.DataFrame((x.split('-')for x in df_inner['category']),index=df_inner.index,columns=['category','size'])

数据提取:主要用到 loc、iloc 和 ix 三个函数。loc 按标签值进行提取,iloc 按位置进行提取,ix 可以同时按标签和位置进行提取(注意:ix 已被弃用,并在 pandas 1.0 中移除,建议改用 loc 或 iloc)。

df_inner.loc[3]#按索引提取单行的数值
id                         1004
date        2013-01-05 00:00:00
city                   SHENZHEN
category                  110-C
age                          32
price                      5433
gender                   female
pay                           Y
m-point                      40
sign                        NaN
Name: 3, dtype: object
df_inner.iloc[0:5]#按索引提取区域行数值
id date city category age price gender pay m-point sign
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 female N 12 NaN
1 1002 2013-01-03 SH 100-B 44 3299.5 male Y 10 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male Y 20 NaN
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female Y 40 NaN
4 1005 2013-01-06 SHANGHAI 210-A 34 3299.5 male N 40 NaN
df_inner=df_inner.set_index('date') #设置日期为索引
df_inner.iloc[[0,2,5],[4,5]]#提取第0、2、5行,4、5列
price gender
date
2013-01-02 1200.0 female
2013-01-04 2133.0 male
2013-01-07 4432.0 female
df_inner.iloc[:3,:2]#冒号前后的数字不再是索引的标签名称,而是数据所在的位置,从0开始,前三行,前两列
id city
date
2013-01-02 1001 BEIJING
2013-01-03 1002 SH
2013-01-04 1003 GUANGZHOU
df_inner.ix[:'2013-01-03',:4]#使用ix按索引标签和位置混合提取数据,2013-01-03号之前,前四列数据
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
id city category age
date
2013-01-02 1001 BEIJING 100-A 23
2013-01-03 1002 SH 100-B 44
df_inner.loc[df_inner['city'].isin(['BEIJING','SHANGHAI'])]#判断city列里是否包含beijing和shanghai,然后将符合条件的数据提取出来
id city category age price gender pay m-point sign
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 female N 12 NaN
2013-01-06 1005 SHANGHAI 210-A 34 3299.5 male N 40 NaN
2013-01-07 1006 BEIJING 130-F 32 4432.0 female Y 40 1.0
pd.DataFrame(df_inner["category"].str[:3])#提取前三个字符,并生成数据表
category
date
2013-01-02 100
2013-01-03 100
2013-01-04 110
2013-01-05 110
2013-01-06 210
2013-01-07 130
pd.DataFrame(df_inner["category"].str[-1])#提取最后一个字符,并生成数据表
category
date
2013-01-02 A
2013-01-03 B
2013-01-04 A
2013-01-05 C
2013-01-06 A
2013-01-07 F

数据筛选

使用与、或、非三个条件配合大于、小于、等于对数据进行筛选,并进行计数和求和

df_inner
id city category age price gender pay m-point sign
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 female N 12 NaN
2013-01-03 1002 SH 100-B 44 3299.5 male Y 10 NaN
2013-01-04 1003 GUANGZHOU 110-A 54 2133.0 male Y 20 NaN
2013-01-05 1004 SHENZHEN 110-C 32 5433.0 female Y 40 NaN
2013-01-06 1005 SHANGHAI 210-A 34 3299.5 male N 40 NaN
2013-01-07 1006 BEIJING 130-F 32 4432.0 female Y 40 1.0
df_inner.loc[(df_inner['age']>25)&(df_inner['city']=='BEIJING'),['id','city','age','price','gender','pay','sign']]#与筛选
id city age price gender pay sign
date
2013-01-07 1006 BEIJING 32 4432.0 female Y 1.0
df_inner.loc[(df_inner['age']>25)|(df_inner['city']=='BEIJING'),['id','city','age','price','gender','pay','sign']]#或筛选
id city age price gender pay sign
date
2013-01-02 1001 BEIJING 23 1200.0 female N NaN
2013-01-03 1002 SH 44 3299.5 male Y NaN
2013-01-04 1003 GUANGZHOU 54 2133.0 male Y NaN
2013-01-05 1004 SHENZHEN 32 5433.0 female Y NaN
2013-01-06 1005 SHANGHAI 34 3299.5 male N NaN
2013-01-07 1006 BEIJING 32 4432.0 female Y 1.0
df_inner.loc[(df_inner['city']!='BEIJING'),['id','city','age','price','gender','pay','sign']]#非筛选
id city age price gender pay sign
date
2013-01-03 1002 SH 44 3299.5 male Y NaN
2013-01-04 1003 GUANGZHOU 54 2133.0 male Y NaN
2013-01-05 1004 SHENZHEN 32 5433.0 female Y NaN
2013-01-06 1005 SHANGHAI 34 3299.5 male N NaN
df_inner.loc[(df_inner['city']!='BEIJING'),['id','city','age','price','gender','pay','sign']].city.count()#筛选后的数据按照city数计数
4
df_inner.query('["BEIJING","SHANGHAI"]==city').price.sum()#使用query函数进行筛选,再求和
8931.5

数据汇总

df_inner.groupby('city').count()#对所有的列进行计数汇总
id category age price gender pay m-point sign
city
BEIJING 2 2 2 2 2 2 2 1
GUANGZHOU 1 1 1 1 1 1 1 0
SH 1 1 1 1 1 1 1 0
SHANGHAI 1 1 1 1 1 1 1 0
SHENZHEN 1 1 1 1 1 1 1 0
df_inner.groupby('city')['id'].count()#按城市对id字段进行计数
city
BEIJING      2
GUANGZHOU    1
SH           1
SHANGHAI     1
SHENZHEN     1
Name: id, dtype: int64
df_inner.groupby(['city','price'])['id'].count()#对两个字段进行汇总计数
city       price
BEIJING    1200.0    1
           4432.0    1
GUANGZHOU  2133.0    1
SH         3299.5    1
SHANGHAI   3299.5    1
SHENZHEN   5433.0    1
Name: id, dtype: int64
df_inner.groupby('city')['price'].agg([len,np.sum,np.mean])  # Group by city and compute the count (len), sum and mean of price
len sum mean
city
BEIJING 2.0 5632.0 2816.0
GUANGZHOU 1.0 2133.0 2133.0
SH 1.0 3299.5 3299.5
SHANGHAI 1.0 3299.5 3299.5
SHENZHEN 1.0 5433.0 5433.0
df_inner.sample(n=2)#采样
id city category age price gender pay m-point sign
date
2013-01-05 1004 SHENZHEN 110-C 32 5433.0 female Y 40 NaN
2013-01-07 1006 BEIJING 130-F 32 4432.0 female Y 40 1.0
weights = [0, 0, 0, 0, 0.5, 0.5]
df_inner.sample(n=2, weights=weights)#手动设置采样权重
id city category age price gender pay m-point sign
date
2013-01-07 1006 BEIJING 130-F 32 4432.0 female Y 40 1.0
2013-01-06 1005 SHANGHAI 210-A 34 3299.5 male N 40 NaN
df_inner.sample(n=6, replace=False) #采样不放回
id city category age price gender pay m-point sign
date
2013-01-04 1003 GUANGZHOU 110-A 54 2133.0 male Y 20 NaN
2013-01-03 1002 SH 100-B 44 3299.5 male Y 10 NaN
2013-01-02 1001 BEIJING 100-A 23 1200.0 female N 12 NaN
2013-01-05 1004 SHENZHEN 110-C 32 5433.0 female Y 40 NaN
2013-01-07 1006 BEIJING 130-F 32 4432.0 female Y 40 1.0
2013-01-06 1005 SHANGHAI 210-A 34 3299.5 male N 40 NaN
df_inner.describe().round(2).T #round函数设置显示小数位,T表示转置
count mean std min 25% 50% 75% max
id 6.0 1003.5 1.87 1001.0 1002.25 1003.5 1004.75 1006.0
age 6.0 36.5 10.88 23.0 32.00 33.0 41.50 54.0
price 6.0 3299.5 1523.35 1200.0 2424.62 3299.5 4148.88 5433.0
m-point 6.0 27.0 14.63 10.0 14.00 30.0 40.00 40.0
sign 1.0 1.0 NaN 1.0 1.00 1.0 1.00 1.0
df_inner['price'].std()#计算列的标准差
1523.3516337339847
df_inner['price'].cov(df_inner['m-point'])#计算两个字段间的协方差
16423.2
df_inner.cov()#数据表中所有字段间的协方差
id age price m-point sign
id 3.5 -0.7 1946.0 25.0 NaN
age -0.7 118.3 -1353.5 -39.4 NaN
price 1946.0 -1353.5 2320600.2 16423.2 NaN
m-point 25.0 -39.4 16423.2 214.0 NaN
sign NaN NaN NaN NaN NaN
df_inner.corr() #相关系数在-1到1之间,接近1为正相关,接近-1为负相关,0为不相关
id age price m-point sign
id 1.000000 -0.034401 0.682824 0.913480 NaN
age -0.034401 1.000000 -0.081689 -0.247626 NaN
price 0.682824 -0.081689 1.000000 0.736972 NaN
m-point 0.913480 -0.247626 0.736972 1.000000 NaN
sign NaN NaN NaN NaN NaN
df_inner['price'].corr(df_inner['m-point'])  # Correlation coefficient between the price and m-point columns
0.7369715663870097
df_inner.to_csv('df_inner.csv')#数据写入到csv

pandas用法总结相关推荐

  1. Python pandas用法

    Python pandas用法 无味之味关注 12019.01.10 15:43:25字数 2,877阅读 91,914 介绍 在Python中,pandas是基于NumPy数组构建的,使数据预处理. ...

  2. 用python的pandas打开csv文件_python读写数据读写csv文件(pandas用法)

    python中数据处理是比较方便的,经常用的就是读写文件,提取数据等,本博客主要介绍其中的一些用法.Pandas是一个强大的分析结构化数据的工具集;它的使用基础是Numpy(提供高性能的矩阵运算);用 ...

  3. pandas用法小结

    前言 个人感觉网上对pandas的总结感觉不够详尽细致,在这里我对pandas做个相对细致的小结吧,在数据分析与人工智能方面会有所涉及到的东西在这里都说说吧,也是对自己学习的一种小结! pandas用 ...

  4. Pandas 用法总结

    Pandas 用法总结 Pandas 简述: Pandas 是什么? Pandas是一个强大的分析结构化数据的工具集:它的使用基础是Numpy(提供高性能的矩阵运算):用于数据挖掘和数据分析,同时也提 ...

  5. python panda用法_Python3 pandas用法大全

    Python3 pandas用法大全 一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: importnumpy as npimport pandas as p ...

  6. 【机器学习基础】前置知识(四):一文掌握Pandas用法

    Pandas提供快速,灵活和富于表现力的数据结构,是强大的数据分析Python库. 本文收录于机器学习前置教程系列. 一.Series和DataFrame Pandas建立在NumPy之上,更多Num ...

  7. python panda用法_Python Pandas用法入门

    简介 首先pandas是基于numpy进行开发的. Pandas 的主要数据结构是 Series(一维数据)与 DataFrame(二维数据),这两种数据结构足以处理金融.统计.社会科学.工程等领域里 ...

  8. python pandas 分类汇总用法_python之pandas用法大全

    一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: import numpy as np import pandas as pd 2.导入CSV或者xlsx文件 ...

  9. Pandas教程(pandas用法)

    Pandas 基本介绍 Numpy 和 Pandas 有什么不同 如果用 python 的列表和字典来作比较, 那么可以说 Numpy 是列表形式的,没有数值标签,而 Pandas 就是字典形式.Pa ...

  10. pandas用法详解

    一.生成数据表 1.首先导入pandas库,一般都会用到numpy库,所以我们先导入备用: import numpy as np import pandas as pd 2.导入CSV或者xlsx文件 ...

最新文章

  1. 综合素质计算机能力,教资统考中学《综合素质》:信息处理能力(一)
  2. 「 每日一练,快乐水题 」744. 寻找比目标字母大的最小字母
  3. c#实现Socket网络编程
  4. android AlertDialog.Builder
  5. PHP做二次开发:ThinkCMF门户应用安装
  6. python10086查询系统_Python获取移动性能指标
  7. php全选帖子删除,用PHP实现全选全删
  8. 网络连接数4000多正常吗_怀孕36周时胎儿发育情况是怎样的?胎儿体重有4斤多正常吗?...
  9. Redis Cluster部署、管理和测试
  10. linux 使cpu使用率升高_linux性能优化
  11. Candence PCB Allegro①贴片封装绘制
  12. 《壁纸 - 大全精选手机壁纸》EULA条款协议
  13. Civil 3d中的mms文件
  14. python anova_在python中对GLM进行Anova测试
  15. dell服务器服务器数据丢失后,数据恢复
  16. 测试方法——因果图法和判定表法
  17. OSGEarth解决南北极空洞问题
  18. GNU的C++代码书写规范
  19. Java限流策略与算法
  20. 0705第七讲标准模版库

热门文章

  1. Nacos中namespace,groupId,dataId使用
  2. 基于pytorch+transformers的车牌识别
  3. HTTP协议和RPC协议
  4. TP5框架集成极验验证码
  5. IT职业教育(1) 北大青鸟APTECH 1
  6. Xposed模块开发教程
  7. Direct3D 10转型时代所面临的空前窘境
  8. 倍福控制器连接松下EtherCAT伺服注意事项
  9. jdk1.8 Switch 不能使用 String类型
  10. OpenProcessToken LookupPrivilegeValue 和AdjustTokenPrivilege