爱篮球,爱人工智能,爱生活。

探索性地对科比数据集进行分析

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inlinefrom sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
# import data
filename= "data.csv"
raw = pd.read_csv(filename)
print (raw.shape)
raw.head()

打印结果:

# Keep only the rows whose shot_made_flag is known; the rows where it is
# missing (about 5000 according to the original note) are left for testing.
has_label = raw['shot_made_flag'].notnull()
kobe = raw[has_label]
print(kobe.shape)

打印结果:

(25697, 25)
# plt.subplot(RCN): the first digit is the number of rows, the second the
# number of columns, the third the index of the active panel.
alpha = 0.02
plt.figure(figsize=(10, 10))

# loc_x and loc_y
plt.subplot(121)
# Single-letter color codes are written in lowercase: newer matplotlib
# releases no longer accept the uppercase forms 'R' / 'B'.
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# lat and lon
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
plt.title('lat and lon')

打印结果:

# Euclidean distance of every row's (loc_x, loc_y) point from the origin.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Angle feature: arctan(loc_y / loc_x), with pi/2 used wherever loc_x == 0
# so the division is only evaluated on non-zero denominators.
loc_x_zero = raw['loc_x'] == 0
# Initialise as a float column and write through .loc. The previous
# raw['angle'][mask] = ... form is chained indexing, which pandas does not
# guarantee to write back into `raw`, and it started from an integer column.
raw['angle'] = np.pi / 2
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)

# Remaining time expressed in seconds.
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']
# Print the distinct values of each shot-description column, followed by
# the frequency of every shot_type value.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(kobe[column].unique())
print(kobe['shot_type'].value_counts())

打印结果:

['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
 'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
 'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
 'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
 'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
 'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
 'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
 'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
 'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
 'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
 'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
 'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
 'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
 'Running Bank shot' 'Driving Bank shot' 'Driving Jump shot'
 'Putback Layup Shot' 'Putback Dunk Shot' 'Running Finger Roll Layup Shot'
 'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
 'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
 'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal    20285
3PT Field Goal     5412
Name: shot_type, dtype: int64
# Distinct season labels present in the labelled rows.
season_labels = kobe['season'].unique()
season_labels

打印结果:

array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
       '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
       '2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
       '1998-99', '1999-00'], dtype=object)
def _season_suffix(label):
    """Return the part of a season label after the first dash, as an int."""
    return int(label.split('-')[1])


# Replace every season label (e.g. '2000-01') by the integer after the dash
# (here 1), then list the distinct resulting values.
raw['season'] = raw['season'].map(_season_suffix)
raw['season'].unique()

打印结果:

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 97,
       98, 99,  0], dtype=int64)
# Show the matchup and opponent columns next to each other.
kobe[['matchup', 'opponent']]

打印结果:

# Plot the computed 'dist' column against the dataset's own shot_distance
# column to see how the two relate.
plt.figure(figsize=(5, 5))
plt.scatter(raw.dist, raw.shot_distance, color='blue')
plt.title('dist and shot_distance')

打印结果:

# Row count for every shot_zone_area value, then the number of distinct
# groups that grouping by that column produces.
area_counts = kobe['shot_zone_area'].value_counts()
print(area_counts)
gs = kobe.groupby('shot_zone_area')
print(len(gs))

打印结果:

Center(C)                11289
Right Side Center(RC)     3981
Right Side(R)             3859
Left Side Center(LC)      3364
Left Side(L)              3132
Back Court(BC)              72
Name: shot_zone_area, dtype: int64
6
import matplotlib.cm as cm

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter loc_x against loc_y on the current axes, one color per group.

    Args:
        feat: Name of the ``kobe`` column whose distinct values define the
            groups.
    """
    alpha = 0.1
    gs = kobe.groupby(feat)
    # One evenly spaced color from the rainbow colormap for each group.
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    # Iterating a groupby yields (group key, sub-frame) pairs.
    for (_, group), c in zip(gs, cs):
        plt.scatter(group.loc_x, group.loc_y, color=c, alpha=alpha)

# shot_zone_area
# One panel per zone feature, left to right in a 1x3 grid; every panel is
# titled with the name of the column used to color it.
zone_features = ('shot_zone_area', 'shot_zone_basic', 'shot_zone_range')
for panel, feature in enumerate(zone_features, start=1):
    plt.subplot(1, 3, panel)
    scatter_plot_by_category(feature)
    plt.title(feature)

打印结果:

# Columns removed from the frame before modelling.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# Drop them in a single call; the axis is given by keyword because newer
# pandas versions no longer accept it as a positional argument.
raw = raw.drop(drops, axis=1)

print(raw['combined_shot_type'].value_counts())
# Preview the one-hot encoding of combined_shot_type (first two rows only).
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]

打印结果:

# Replace each categorical column by its one-hot (dummy) columns, which are
# prefixed with the name of the column they came from.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    dummies = pd.get_dummies(raw[var], prefix=var)
    # axis is passed by keyword: newer pandas rejects it positionally.
    raw = pd.concat([raw, dummies], axis=1)
    raw = raw.drop(var, axis=1)

# Rows with a known shot_made_flag form the training set; the flag itself
# becomes the label and is removed from the features.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

# Rows with a missing shot_made_flag form the test set.
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time
import numpy as np
# Five log-spaced values between 10**0 and 10**2, truncated to integers;
# the bare name on the last line displays the array as the cell output.
log_spaced = np.logspace(start=0, stop=2, num=5)
range_m = log_spaced.astype(int)
range_m
# find the best n_estimators for RandomForestClassifier
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
# sklearn.cross_validation was removed from scikit-learn; KFold is provided
# by sklearn.model_selection and is driven through its split() method.
from sklearn.model_selection import KFold


def _cv_log_loss(model, features, labels, n_splits=10):
    """Return the mean log loss of ``model`` over a shuffled K-fold split.

    Args:
        model: Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
        features: DataFrame of training features (indexed with ``.iloc``).
        labels: Series of target values aligned with ``features``.
        n_splits: Number of folds.

    Returns:
        The log loss averaged over the ``n_splits`` held-out folds.
    """
    folds = KFold(n_splits=n_splits, shuffle=True)
    total = 0.
    for train_k, test_k in folds.split(features):
        model.fit(features.iloc[train_k], labels.iloc[train_k])
        # log_loss scores probability estimates. Feeding it the hard 0/1
        # output of predict() treats every prediction as fully confident
        # and inflates the score.
        proba = model.predict_proba(features.iloc[test_k])
        # Pass the class order explicitly so the probability columns stay
        # aligned even if a fold happens to miss a class.
        total += log_loss(labels.iloc[test_k], proba, labels=model.classes_)
    return total / n_splits


print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1, 100, num=10).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()

    rfc = RandomForestClassifier(n_estimators=n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_n.append(rfc_score)
    # Lower log loss is better: keep the candidate with the smallest score.
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)

# find best max_depth for RandomForestClassifier
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    # Reuse the tree count selected by the search above.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)

打印结果:

Finding best n_estimators for RandomForestClassifier...
the number of trees : 1
Done processing 1 trees (1.318sec)
the number of trees : 12
Done processing 12 trees (9.057sec)
the number of trees : 23
Done processing 23 trees (16.842sec)
the number of trees : 34
Done processing 34 trees (24.728sec)
the number of trees : 45
Done processing 45 trees (32.499sec)
the number of trees : 56
Done processing 56 trees (40.416sec)
the number of trees : 67
Done processing 67 trees (48.344sec)
the number of trees : 78
Done processing 78 trees (56.638sec)
the number of trees : 89
Done processing 89 trees (64.037sec)
the number of trees : 100
Done processing 100 trees (72.883sec)
67 11.840059155801537
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (5.090sec)
[0. 0. 0. ... 0. 0. 1.]
the max depth : 10
Done processing 10 trees (16.442sec)
[1. 1. 1. ... 0. 0. 0.]
the max depth : 100
Done processing 100 trees (49.503sec)
[0. 0. 1. ... 1. 1. 1.]
10 10.983861400622605
# Cross-validation score for every candidate value of the two
# hyper-parameters, shown side by side.
plt.figure(figsize=(10, 5))

plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')

打印结果:

# Train the final forest on all labelled rows using the tree count and
# depth chosen by the searches above.
final_params = {'n_estimators': best_n, 'max_depth': best_m}
model = RandomForestClassifier(**final_params)
model.fit(train_kobe, train_label)

打印结果:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

科比数据集分析及预测相关推荐

  1. 宝可梦数据集分析及预测

    前言 以下内容为本人学习过程中记录,仅用于学习,如有错误或者纰漏,请留言指正,谢谢. 数据集和代码下载 – 百度云链接:https://pan.baidu.com/s/1RFUEVcD85J2AQ3_ ...

  2. 机器学习实战之科比数据集分析(随机森林寻最优值参数)

    文章目录 总体思路分为三部 1.查看数据,对数据进行清洗,规约 1.1 查看数据 1.2 数据清洗,规约 1.3 删除不相关的特征 1.4 数据one-hot处理* 2.建立模型,挑选出最优参数 2. ...

  3. NBA球星生涯数据集分析

    源码链接: https://download.csdn.net/download/qq_58012062/87541713?spm=1001.2014.3001.5501 数据提取:链接:https: ...

  4. python计算多个模型在不同数据集上的预测概率、获取每个数据集上的最优模型、多个最优模型的ROC曲线进行对比分析

    pytyon计算多个模型在不同数据集上的预测概率.获取每个数据集上的最佳模型.多个最优模型的ROC曲线进行对比分析 目录

  5. 练习:科比数据集的处理和预测

    import pandas as pd #读取数据,并返回一个DataFrame对象 raw = pd.read_csv("F:\\skdata\\kobe.csv") #把剩余时 ...

  6. 酒店数据集退订分析与预测

    酒店数据集退订分析与预测 1.背景 2.提出问题 3.理解数据 3.1数据清洗 4.可视化分析 4.1 两家酒店总体退订率情况 4.2 退订用户特征 4.2.1用户需求属性 4.2.2 用户行为属性 ...

  7. 基于分布式的智联招聘数据的大屏可视化分析与预测

    项目需求分析及体系架构 1.1项目介绍 互联网成了海量信息的载体,目前是分析市场趋势.监视竞争对手或者获取销售线索的最佳场所,数据采集以及分析能力已成为驱动业务决策的关键技能.<计算机行业岗位招 ...

  8. python之AQI分析与预测

    AQI分析与预测 背景介绍 AQI,指空气质量指数,用来衡量空气清洁或污染的程度.值越小,表示空气质量越好. 分析目标 哪些城市的空气质量较好/较差? 空气质量在地理位置分布上,是否具有一定的规律性? ...

  9. AQI(Air Quality Index)分析与预测

    AQI(Air Quality Index)分析与预测 背景: 空气质量指数是用来衡量空气清洁或者污染的程度,值越小,表示空气质量越好;近年来,空气质量越来越受到人们的关注. 任务描述: 一.描述性统 ...

最新文章

  1. 计算机基本信息的获取
  2. linux c++ 调用matlab,ubuntu系统下C++调用matlab程序的方法详解
  3. 交错字符串Python解法
  4. 全网最新Spring Boot2.5.1整合Activiti5.22.0企业实战教程<网关篇>
  5. Spring对象绑定与类型转换
  6. 把数字随机分成 php,php随机数 微信随机生成红包金额算法php版
  7. 字符串%百分号 和 format 格式化
  8. WinError 126 asmjit.dll or one of its dependencies.
  9. mysqlplus 批量插入_解决SpringBoot+Druid+Mybatis Plus 执行MySQL批量插入,更新 报错的问题...
  10. 《离散数学及其应用》【张清华版】 第四章习题总结
  11. 计算机考研专业课数字,2020北京航空航天大学计算机考研初试专业课经验
  12. 为什么有些程序员明明很努力,但是却回报很低,收益很小,工资始终上不去-出自中华石杉老师
  13. 2022-10-11 myql-exists子查询外表关联记录
  14. Delta3D(6)教程:创建游戏角色-2
  15. ELK日志处理之Filebeat工作原理
  16. DCloud之APP离线SDK升级步骤(3.5.3升至最新版3.6.7.81556_20221018)
  17. 随手练——字符串按最小(大)字典序拼接
  18. 化学实验室改造方案怎么做?
  19. Java语法理论和面经杂疑篇《七. 数据结构与集合源码》
  20. arduino定时器pdf_Arduino基础入门篇18—数字时钟

热门文章

  1. 通过FinalShell连接AWS的EC2服务器
  2. 2022年最新江西机动车签字授权人模拟试题及答案
  3. 第九章计算机网络安全(完结撒花)
  4. 非计算机专业人员的程序之路
  5. 【英语】VOA60-second science听力
  6. 用python做生日礼物_利用python画一份素描合集,给女朋友一份独特生日礼物
  7. Win10下将Ubuntu16.04安装在移动固态硬盘上的若干问题
  8. python求二维数组的鞍点_C语言程序,找出一个二维数组的鞍点。
  9. mysql 判断时间是否当天_MySQL 获取当天日期
  10. 良好的协同管理,是数字时代的成功前提