#!/usr/bin/env python
# coding: utf-8
# TI=FIFA2018球员数据分析
# 明确分析目的
#   运动员数量前十名的国家,以及平均身价
#   各大联赛运动员数量,以及球员平均身价
#   各俱乐部的平均周薪
#   英超联赛English Premier League各个俱乐部球员的平均周薪
#   球员年龄分布情况,不同年龄段平均身价分布
# 引入使用的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 加载数据文件
df = pd.read_csv('./FIFA_2018_player.csv')
# 查看数据具有哪些列,什么类型
df.info()
# 可见共17994行,league和club有缺失值
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17994 entries, 0 to 17993
Data columns (total 12 columns):
ID             17994 non-null int64
name           17994 non-null object
full_name      17994 non-null object
nationality    17994 non-null object
league         17741 non-null object
club           17741 non-null object
age            17994 non-null int64
birth_date     17994 non-null object
height_cm      17994 non-null float64
weight_kg      17994 non-null float64
eur_value      17994 non-null float64
eur_wage       17994 non-null float64
dtypes: float64(4), int64(2), object(6)
memory usage: 1.6+ MB
df.head()
ID name full_name nationality league club age birth_date height_cm weight_kg eur_value eur_wage
0 20801 Cristiano Ronaldo C. Ronaldo dos Santos Aveiro Portugal Spanish Primera División Real Madrid CF 32 1985-02-05 185.0 80.0 95500000.0 565000.0
1 158023 L. Messi Lionel Messi Argentina Spanish Primera División FC Barcelona 30 1987-06-24 170.0 72.0 105000000.0 565000.0
2 190871 Neymar Neymar da Silva Santos Jr. Brazil French Ligue 1 Paris Saint-Germain 25 1992-02-05 175.0 68.0 123000000.0 280000.0
3 176580 L. Suárez Luis Suárez Uruguay Spanish Primera División FC Barcelona 30 1987-01-24 182.0 86.0 97000000.0 510000.0
4 167495 M. Neuer Manuel Neuer Germany German Bundesliga FC Bayern Munich 31 1986-03-27 193.0 92.0 61000000.0 230000.0
df.describe()
ID age height_cm weight_kg eur_value eur_wage
count 17994.000000 17994.000000 17994.000000 17994.000000 1.799400e+04 17994.000000
mean 207791.796543 25.120151 181.271980 75.400856 2.370511e+06 11503.834612
std 32328.527723 4.617428 6.690392 6.994824 5.347250e+06 23050.661073
min 16.000000 16.000000 155.000000 49.000000 0.000000e+00 0.000000
25% 192621.250000 21.000000 177.000000 70.000000 3.000000e+05 2000.000000
50% 214186.000000 25.000000 181.000000 75.000000 7.000000e+05 4000.000000
75% 231615.750000 28.000000 186.000000 80.000000 2.000000e+06 12000.000000
max 241489.000000 47.000000 205.000000 110.000000 1.230000e+08 565000.000000
df.count()
# 可见league 和 club有缺失值
ID             17994
name           17994
full_name      17994
nationality    17994
league         17741
club           17741
age            17994
birth_date     17994
height_cm      17994
weight_kg      17994
eur_value      17994
eur_wage       17994
dtype: int64
# 对于本次的分析目的,其实在加载数据时就可以只加载部分列
# 选出部分列 ID nationality league club age eur_value eur_wage
# 分析的是FIFA2018的数据,age按当年数据计算,birth_date省略
df = df[['ID', 'nationality', 'league', 'club', 'age', 'eur_value', 'eur_wage']]
df
ID nationality league club age eur_value eur_wage
0 20801 Portugal Spanish Primera División Real Madrid CF 32 95500000.0 565000.0
1 158023 Argentina Spanish Primera División FC Barcelona 30 105000000.0 565000.0
2 190871 Brazil French Ligue 1 Paris Saint-Germain 25 123000000.0 280000.0
3 176580 Uruguay Spanish Primera División FC Barcelona 30 97000000.0 510000.0
4 167495 Germany German Bundesliga FC Bayern Munich 31 61000000.0 230000.0
... ... ... ... ... ... ... ...
17989 237463 England English League One Scunthorpe United 17 50000.0 1000.0
17990 11728 England English League Two Wycombe Wanderers 47 0.0 1000.0
17991 231381 Scotland English League Two Swindon Town 17 60000.0 1000.0
17992 238813 England English League Two Crewe Alexandra 18 60000.0 1000.0
17993 238308 Ghana English League One Scunthorpe United 18 50000.0 1000.0

17994 rows × 7 columns

df[df.league.isnull()]
ID nationality league club age eur_value eur_wage
163 188152 Brazil NaN NaN 25 0.0 0.0
168 184826 Portugal NaN NaN 28 0.0 0.0
271 177413 Belgium NaN NaN 28 0.0 0.0
480 176733 Sweden NaN NaN 30 0.0 0.0
494 169195 Brazil NaN NaN 29 0.0 0.0
... ... ... ... ... ... ... ...
17267 234509 India NaN NaN 29 0.0 0.0
17486 234508 India NaN NaN 20 0.0 0.0
17489 223760 India NaN NaN 24 0.0 0.0
17511 233526 India NaN NaN 22 0.0 0.0
17568 231057 New Zealand NaN NaN 20 0.0 0.0

253 rows × 7 columns

# 可以看到联赛、俱乐部是空值,同时这些条目的身价、周薪都是0
# 删除数据一般在后面进行,但四行数据都异常可以先删除
df.drop(df[df.league.isnull()].index,inplace=True)
# 查看删除后情况
df.count()
ID             17741
nationality    17741
league         17741
club           17741
age            17741
eur_value      17741
eur_wage       17741
dtype: int64
# 查看数据中数值、浮点型数据整体信息
df.describe()
ID age eur_value eur_wage
count 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 2.404317e+06 11667.887943
std 32421.331072 4.616413 5.377693e+06 23173.181633
min 16.000000 16.000000 0.000000e+00 1000.000000
25% 192621.000000 21.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 1.230000e+08 565000.000000
# 经查看,eur_value 在最小值上有问题,0.000000e+00
# 筛选一下数据,查看eur_value == 0.000000e+00的有多少
df[df['eur_value'] == 0.000000e+00].count()
ID             6
nationality    6
league         6
club           6
age            6
eur_value      6
eur_wage       6
dtype: int64
# eur_value == 0.000000e+00的有6条
df[df['eur_value'] == 0.000000e+00]
ID nationality league club age eur_value eur_wage
2199 3665 France French Ligue 1 ES Troyes AC 40 0.0 16000.0
3105 17605 Belgium Belgian First Division A Club Brugge KV 40 0.0 14000.0
3272 176900 Colombia Colombian Primera A Asociacion Deportivo Cali 40 0.0 2000.0
7734 148745 Norway Norwegian Eliteserien Sogndal 41 0.0 2000.0
17628 149727 England Rep. Ireland Premier Division St. Patrick's Athletic 37 0.0 1000.0
17990 11728 England English League Two Wycombe Wanderers 47 0.0 1000.0
# 使用平均值填充这些身价为0的数据
# df[df['eur_value'] == 0.000000e+00].loc[:,'eur_value'] = 2.404317e+06 警告
# 使用平均值填充这些身价为0的数据
df['eur_value'].replace(0, df['eur_value'].mean(), inplace = True)
df.describe()
# 可见数据已经填充成功
ID age eur_value eur_wage
count 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 2.405130e+06 11667.887943
std 32421.331072 4.616413 5.377511e+06 23173.181633
min 16.000000 16.000000 1.000000e+04 1000.000000
25% 192621.000000 21.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 1.230000e+08 565000.000000
# 检查是否有整个条目完全重复值,若有则展示
df[df.duplicated()]
ID nationality league club age eur_value eur_wage
# 检查指定列是否有重复值
df[df['ID'].duplicated()]
ID nationality league club age eur_value eur_wage
# 查看分类统计值是否有不合逻辑的类名
df['league'].value_counts()
Argentinian Superliga            780
English Championship             717
English League One               668
English Premier League           654
Spanish Segunda División         637
English League Two               633
Italian Serie B                  625
USA Major League Soccer          625
Spanish Primera División         602
French Ligue 1                   598
Italian Serie A                  559
Colombian Primera A              552
French Ligue 2                   543
German Bundesliga                537
Japanese J1 League               519
Mexican Liga MX                  518
German 3. Liga                   515
German 2. Bundesliga             510
Portuguese Primeira Liga         509
Turkish Süper Lig                502
Holland Eredivisie               488
Russian Premier League           449
Belgian First Division A         436
Polish Ekstraklasa               418
Saudi Professional League        411
Norwegian Eliteserien            393
Swedish Allsvenskan              389
Danish Superliga                 365
Korean K League Classic          336
Scottish Premiership             321
Chilian Primera División         320
Campeonato Brasileiro Série A    320
Rep. Ireland Premier Division    288
Swiss Super League               263
Austrian Bundesliga              259
Australian A-League              236
Greek Super League               111
South African PSL                 56
Czech Liga                        28
Finnish Veikkausliiga             27
Ukrainian Premier League          24
Name: league, dtype: int64
# 数据清洗完毕,开始分析
# 样本总数
df.count()
ID             17741
nationality    17741
league         17741
club           17741
age            17741
eur_value      17741
eur_wage       17741
dtype: int64
# 数值类型列统计学指标
df.describe()
ID age eur_value eur_wage
count 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 2.405130e+06 11667.887943
std 32421.331072 4.616413 5.377511e+06 23173.181633
min 16.000000 16.000000 1.000000e+04 1000.000000
25% 192621.000000 21.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 1.230000e+08 565000.000000
# 运动员数量前十名的国家
nationality_data = df.groupby('nationality', as_index = False)  #拿出按国家分组的数据
nat_count = nationality_data.count()[['nationality','ID']]     #计数,拿出国家和ID两列
nat_count.rename(columns = {'ID':'ath_count'}, inplace = True) #对列名重命名
nat_head10 = nat_count.sort_values('ath_count', ascending = False).head(10)#降序排序,取前十
nat_head10
nationality ath_count
44 England 1631
57 Germany 1147
135 Spain 1020
53 France 966
5 Argentina 962
18 Brazil 803
75 Italy 800
29 Colombia 591
78 Japan 471
105 Netherlands 430
# 运动员数量前十名的国家及其平均球员身价
nat_val_mean = nationality_data[['nationality','eur_value']].mean()
nat_val_mean.rename(columns = {'eur_value':'val_mean'})
nat_head10_val_mean = pd.merge(nat_head10,nat_val_mean, on = 'nationality', how = 'left')
nat_head10_val_mean
nationality ath_count eur_value
0 England 1631 1.425410e+06
1 Germany 1147 2.609010e+06
2 Spain 1020 4.465897e+06
3 France 966 3.314264e+06
4 Argentina 962 2.900120e+06
5 Brazil 803 4.001071e+06
6 Italy 800 2.681325e+06
7 Colombia 591 1.719068e+06
8 Japan 471 8.067091e+05
9 Netherlands 430 3.002930e+06
# 各大联赛运动员数量,以及球员平均身价(操作方式与上述方法类似)
league_data = df.groupby('league', as_index = False)
league_count = league_data.count()[['league','ID']].rename(columns = {'ID':'ath_count'})
league_count.rename(columns = {'ID':'ath_count'}).sort_values('ath_count', ascending = False)
lea_val_mean = league_data[['league','eur_value']].mean().rename(columns = {'eur_value':'val_mean'})
lea_val_mean = pd.merge(league_count, lea_val_mean, on = 'league', how = 'left')
lea_val_mean
league ath_count val_mean
0 Argentinian Superliga 780 1.453788e+06
1 Australian A-League 236 6.848941e+05
2 Austrian Bundesliga 259 7.276062e+05
3 Belgian First Division A 436 1.956719e+06
4 Campeonato Brasileiro Série A 320 2.249016e+06
5 Chilian Primera División 320 2.238234e+06
6 Colombian Primera A 552 9.465567e+05
7 Czech Liga 28 2.141250e+06
8 Danish Superliga 365 7.188767e+05
9 English Championship 717 1.831032e+06
10 English League One 668 4.875075e+05
11 English League Two 633 2.926687e+05
12 English Premier League 654 9.091483e+06
13 Finnish Veikkausliiga 27 2.940741e+05
14 French Ligue 1 598 5.188201e+06
15 French Ligue 2 543 8.015930e+05
16 German 2. Bundesliga 510 1.238333e+06
17 German 3. Liga 515 4.530777e+05
18 German Bundesliga 537 7.702849e+06
19 Greek Super League 111 3.808333e+06
20 Holland Eredivisie 488 2.171250e+06
21 Italian Serie A 559 7.292030e+06
22 Italian Serie B 625 8.344480e+05
23 Japanese J1 League 519 6.472736e+05
24 Korean K League Classic 336 8.854911e+05
25 Mexican Liga MX 518 2.025782e+06
26 Norwegian Eliteserien 393 6.056217e+05
27 Polish Ekstraklasa 418 7.070096e+05
28 Portuguese Primeira Liga 509 3.506257e+06
29 Rep. Ireland Premier Division 288 1.683310e+05
30 Russian Premier League 449 2.679788e+06
31 Saudi Professional League 411 8.512287e+05
32 Scottish Premiership 321 9.114486e+05
33 South African PSL 56 1.150893e+06
34 Spanish Primera División 602 9.257550e+06
35 Spanish Segunda División 637 1.508854e+06
36 Swedish Allsvenskan 389 6.417095e+05
37 Swiss Super League 263 1.134202e+06
38 Turkish Süper Lig 502 2.961036e+06
39 USA Major League Soccer 625 1.484424e+06
40 Ukrainian Premier League 24 8.283750e+06
# 各俱乐部的平均周薪
club_data = df.groupby('club', as_index = False)
club_wage_mean = club_data.mean()[['club','eur_wage']]
club_wage_mean.rename(columns = {'eur_wage':'wage_mean'},inplace = True)
club_wage_mean.sort_values('wage_mean', ascending = False, inplace = True)
club_wage_mean
club wage_mean
219 FC Barcelona 194666.666667
466 Real Madrid CF 170821.428571
222 FC Bayern Munich 123384.615385
330 Juventus 122000.000000
377 Manchester United 109030.303030
... ... ...
97 Bray Wanderers 1000.000000
425 PAOK Thessaloniki 1000.000000
88 Bohemian FC 1000.000000
263 Finn Harps 1000.000000
578 Tigres FC 1000.000000

647 rows × 2 columns

# 英超联赛English Premier League
EPL_data = df[df['league'] == 'English Premier League']
EPL_data.describe()
ID age eur_value eur_wage
count 654.000000 654.000000 6.540000e+02 654.000000
mean 196333.365443 24.711009 9.091483e+06 57840.978593
std 37997.349392 4.769797 1.222195e+07 50627.145927
min 2147.000000 16.000000 6.000000e+04 2000.000000
25% 183551.250000 20.000000 9.125000e+05 17000.000000
50% 201840.000000 25.000000 5.000000e+06 48000.000000
75% 222604.000000 28.000000 1.137500e+07 82000.000000
max 241384.000000 38.000000 9.050000e+07 325000.000000
# 英超联赛English Premier League各个俱乐部球员的平均周薪
EPL_club = EPL_data.groupby('club', as_index = False)
EPL_club_wage_mean = EPL_club.mean()[['club','eur_wage']]
EPL_club_wage_mean.rename(columns = {'eur_wage':'wage_mean'}, inplace = True)
EPL_club_wage_mean.sort_values('wage_mean', ascending = False, inplace = True)
EPL_club_wage_mean
club wage_mean
11 Manchester United 109030.303030
4 Chelsea 105181.818182
10 Manchester City 95787.878788
0 Arsenal 91121.212121
9 Liverpool 83250.000000
6 Everton 76484.848485
16 Tottenham Hotspur 69218.750000
19 West Ham United 61818.181818
13 Southampton 51181.818182
15 Swansea City 43878.787879
8 Leicester City 43875.000000
18 West Bromwich Albion 42516.129032
14 Stoke City 41093.750000
17 Watford 40848.484848
12 Newcastle United 40000.000000
1 Bournemouth 38303.030303
5 Crystal Palace 35181.818182
3 Burnley 33666.666667
2 Brighton & Hove Albion 30454.545455
7 Huddersfield Town 23181.818182
# 球员年龄分布情况,不同年龄段平均身价分布
# ⽣成桶,5岁⼀个分桶,根据上述统计数据可知最⼩16岁,最⼤47
bins = np.arange(15, 50, 5)
bins_data = pd.cut(df['age'], bins)
bin_counts = df['age'].groupby(bins_data).count()
print(bin_counts)
# 可以使用matplotlib粗看一下作图效果,以便在正式出图前作调整
bin_counts.plot(kind='pie')
age
(15, 20]    3300
(20, 25]    6749
(25, 30]    5234
(30, 35]    2192
(35, 40]     258
(40, 45]       7
Name: age, dtype: int64
<matplotlib.axes._subplots.AxesSubplot at 0x14cd9c6bf48>

Python数据分析-FIFA2018球员数据分析相关推荐

  1. 案例分析:FIFA2018球员数据分析

    案例分析练习: FIFA2018球员数据分析 # 引入要使用的库 import numpy as np import pandas as pd import matplotlib.pyplot as ...

  2. 案例:FIFA2018球员数据分析

    对FIFA2018球员做数据分析 整体思路: 1.确定分析目标 2.导入数据文件 3.查看数据 4.确定分析维度和指标 5.清理需要的数据 6.利用数据做分析 7.根据需要做图 首先要对FIFA_20 ...

  3. nba球员数据分析和可视化_可视化NBA球员统计

    nba球员数据分析和可视化 I haven't written a post in a while. I had a lot to do for university and my hobbies l ...

  4. 利用Python进行NBA比赛数据分析

    分享一下我老师大神的人工智能教程!零基础,通俗易懂!http://blog.csdn.net/jiangjunshow 也欢迎大家转载本篇文章.分享知识,造福人民,实现我们中华民族伟大复兴! 利用Py ...

  5. Python中常用的数据分析工具(模块)有哪些?

    本期Python培训分享:Python中常用的数据分析工具(模块)有哪些?Python本身的数据分析功能并不强,需要安装一些第三方的扩展库来增强它的能力.我们课程用到的库包括NumPy.Pandas. ...

  6. python创建变量revenue_Python数据分析:小红书销售额预测

    一.分析背景 根据小红书的部分用户数据以及消费行为数据,使用Python建立线性回归模型,找到对用户消费影响较大的因素,预测用户的消费金额变化.根据模型,确定销售额较高用户的相关特征,并由此提出营销方 ...

  7. python官方推荐的三本书-【数据分析】入门数据分析,你一定要看的三本书

    原标题:[数据分析]入门数据分析,你一定要看的三本书 最近经常被问到怎么入门数据分析,可能很多同学对怎么开始学习还是比较困惑的.我回想自己学习数据分析的经历,总结了一些建议,希望能给到大家帮助. 打好 ...

  8. excel调用python编程-使用python集合进行EXCEL数据分析

    使用python集合进行EXCEL数据分析 标准库 Python真正精彩的方面之一是它具有非常丰富的模块标准库 ,无需安装第三方模块即可进行一些复杂的编程. 如果您在系统上有效安装了python,则可 ...

  9. python 数据分析学什么-数据分析培训学习多久?都学什么?

    数据分析培训学习多久?以博学谷的Python数据分析与应用课程为例,学习数据分析只要22课时.那么都学什么呢?内容大概包括了Python数据分析的相关知识.感兴趣的小伙伴可以一起来看看课程介绍. 1. ...

最新文章

  1. 迪克森沉思录之做Global SAP项目的弊端
  2. unity3d api 中文文档_接口文档系统-showdoc安装部署
  3. J2EE学习中一些值得研究的开源项目(转载天极网)
  4. 计算机录入速度标准,怎么提高电脑录入速度?
  5. Visual Studio listView控件绑定SQL Server数据库并动态显示数据,调整列宽
  6. 前端学习(3121):react-hello-react的state的简写方式
  7. 计算机科学导论第12版答案,计算机科学导论第12章参考答案.pdf
  8. Lattice FPGA 开发工具Diamond使用流程总结——安装
  9. 大智慧炒股软件下载|选股软件
  10. Shottr 免费好用的Mac 截屏软件
  11. 关于一个微信公众号:原子与分子模拟
  12. 如何优化前端页面的LCP?
  13. 弄清USART串口的使能位(UE、TCIE、RXNEIE)和标志位(TC、RXNE)
  14. 新版本读取老版本文件崩溃BUG
  15. MFC修改界面图标时,已导入.ico文件,但是程序运行后,界面不显示更新后的新图标
  16. 计算机基础知识题精选
  17. C语言 三个数字比较大小
  18. 解决在JS中阻止定时器“重复”开启问题、Vue中定时器的使用
  19. 2. C++ Visual Studio中同一个项目包含多个有main函数的源文件怎么分别运行?
  20. KNN实现小麦种子分类问题

热门文章

  1. AT24C02详解(蓝桥杯单片机模块(IIC总线))
  2. JQuery在线引用地址整理
  3. 旅游网站(注册功能)
  4. 输入三个整数x,y,z,请把这三个数由小到大输出。
  5. Lua 实现JSON解析器
  6. HTML 各种按钮,图片按钮,打开文件按钮
  7. 酷比魔方i7手写版linux网卡驱动,酷比魔方手写板安装Ubuntu 16.04
  8. 创建用户tea,stu,并给这两个用户resource,connect角色
  9. 导图整理数组1: 总结了二分查找的通用模板写法, 彻底解决几个易混淆问题, 力扣35:搜索插入位置
  10. React 中的受控组件和非受控组件的区别