科比数据集分析及预测

爱篮球，爱人工智能，爱生活。

对科比数据集进行探索性分析

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inlinefrom sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
# import data
filename= "data.csv"
raw = pd.read_csv(filename)
print (raw.shape)
raw.head()

打印结果：

# Keep only the rows whose shot_made_flag is present (original note: the
# rows without a flag, about 5000, are for test).
has_label = raw['shot_made_flag'].notnull()
kobe = raw[has_label]
print(kobe.shape)

打印结果：

(25697, 25)

# plt.subplot(RCN): first digit is the number of rows, second the number of
# columns, third the index of the active panel.
alpha = 0.02
plt.figure(figsize=(10, 10))

# Left panel: loc_x against loc_y.
# Single-letter color codes must be lowercase; uppercase 'R'/'B' are rejected
# by matplotlib >= 3.3.
plt.subplot(121)
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# Right panel: lon against lat.
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
plt.title('lat and lon')

打印结果：

# Euclidean norm of the (loc_x, loc_y) coordinates.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# angle = arctan(loc_y / loc_x); rows with loc_x == 0 get pi/2 so that no
# division by zero takes place.
# The column starts out as float and is written through .loc: chained
# assignment (raw['angle'][mask] = ...) may write to a temporary copy and
# would otherwise try to store floats in an integer column.
loc_x_zero = raw['loc_x'] == 0
raw['angle'] = np.pi / 2
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x'])

# Minutes and seconds remaining merged into a single count of seconds.
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

# Inspect the distinct values of the shot-description columns.
print(kobe.action_type.unique())
print(kobe.combined_shot_type.unique())
print(kobe.shot_type.unique())
print(kobe.shot_type.value_counts())

打印结果：

['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
 'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
 'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
 'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
 'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
 'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
 'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
 'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
 'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
 'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
 'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
 'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
 'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
 'Running Bank shot' 'Driving Bank shot' 'Driving Jump shot'
 'Putback Layup Shot' 'Putback Dunk Shot' 'Running Finger Roll Layup Shot'
 'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
 'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
 'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal    20285
3PT Field Goal     5412
Name: shot_type, dtype: int64

# Show the distinct season labels, in order of first appearance.
kobe.season.unique()

打印结果：

array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
       '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
       '2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
       '1998-99', '1999-00'], dtype=object)

def _season_suffix(label):
    """Return, as an int, the piece of a season label after its first '-'."""
    return int(label.split('-')[1])


# Replace each 'YYYY-YY' season label by its two-digit suffix.
raw['season'] = raw['season'].map(_season_suffix)
raw['season'].unique()

打印结果：

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 97,
       98, 99,  0], dtype=int64)

# Display the matchup and opponent columns side by side.
kobe[['matchup', 'opponent']]

打印结果：

# Plot the derived dist column against the shot_distance column.
plt.figure(figsize=(5, 5))
plt.scatter(raw.dist, raw.shot_distance, color='blue')
plt.title('dist and shot_distance')

打印结果：

# Group the labelled shots by shot_zone_area, then print the number of rows
# per area followed by the number of groups.
zone_col = 'shot_zone_area'
gs = kobe.groupby(zone_col)
zone_counts = kobe[zone_col].value_counts()
print(zone_counts)
print(len(gs))

打印结果：

Center(C)                11289
Right Side Center(RC)     3981
Right Side(R)             3859
Left Side Center(LC)      3364
Left Side(L)              3132
Back Court(BC)              72
Name: shot_zone_area, dtype: int64
6

import matplotlib.cm as cm

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter loc_x/loc_y of ``kobe`` on the current axes, one color per group.

    Parameters
    ----------
    feat : str
        Name of the ``kobe`` column to group by. Each group is drawn with
        its own color taken from evenly spaced points of the rainbow colormap.
    """
    alpha = 0.1
    gs = kobe.groupby(feat)
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    # Iterating a GroupBy yields (key, sub-frame) pairs; only the frame is used.
    for (_, group), c in zip(gs, cs):
        plt.scatter(group.loc_x, group.loc_y, color=c, alpha=alpha)


# shot_zone_area
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

# shot_zone_basic
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

# shot_zone_range
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')

打印结果：

# Columns removed before modelling.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area',
         'shot_zone_range', 'shot_zone_basic', 'matchup', 'lon', 'lat',
         'seconds_remaining', 'minutes_remaining', 'shot_distance',
         'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# Drop them in one call; axis is passed by keyword because the positional
# form drop(label, 1) is no longer accepted by pandas >= 2.0.
raw = raw.drop(drops, axis=1)

print(raw['combined_shot_type'].value_counts())
# Preview the first two rows of the one-hot encoding of combined_shot_type.
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]

打印结果：

# Replace every categorical column by its one-hot (dummy) columns.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type',
                    'opponent', 'period', 'season']
for var in categorical_vars:
    # axis is passed by keyword: pandas >= 2.0 rejects positional axis here.
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)

# Rows with a known shot_made_flag form the training set; the flag itself
# becomes the label and is removed from the features.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

# Rows with a missing shot_made_flag form the test set.
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, log_loss
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold from
# sklearn.model_selection takes n_splits and yields indices through .split().
from sklearn.model_selection import KFold

range_m = np.logspace(0, 2, num=5).astype(int)
range_m


def _mean_cv_log_loss(model, features, labels, n_splits=10):
    """Return the mean log loss of ``model`` over shuffled K-fold CV.

    Parameters
    ----------
    model : classifier exposing fit, predict_proba and classes_
    features : pandas.DataFrame
        Training features, indexed positionally with ``.iloc``.
    labels : pandas.Series
        Training labels aligned with ``features``.
    n_splits : int
        Number of folds.

    Returns
    -------
    float
        Sum of the per-fold log losses divided by ``n_splits``.
    """
    total = 0.
    folds = KFold(n_splits=n_splits, shuffle=True)
    for train_k, test_k in folds.split(features):
        model.fit(features.iloc[train_k], labels.iloc[train_k])
        # log_loss must be fed class probabilities. Feeding it the hard 0/1
        # output of predict() only measures a clipped error rate.
        proba = model.predict_proba(features.iloc[test_k])
        total += log_loss(labels.iloc[test_k], proba, labels=model.classes_)
    return total / n_splits


# find the best n_estimators for RandomForestClassifier
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1, 100, num=10).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc = RandomForestClassifier(n_estimators=n)
    rfc_score = _mean_cv_log_loss(rfc, train_kobe, train_label)
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)

# find best max_depth for RandomForestClassifier
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    rfc_score = _mean_cv_log_loss(rfc, train_kobe, train_label)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)

打印结果：

Finding best n_estimators for RandomForestClassifier...
the number of trees : 1
Done processing 1 trees (1.318sec)
the number of trees : 12
Done processing 12 trees (9.057sec)
the number of trees : 23
Done processing 23 trees (16.842sec)
the number of trees : 34
Done processing 34 trees (24.728sec)
the number of trees : 45
Done processing 45 trees (32.499sec)
the number of trees : 56
Done processing 56 trees (40.416sec)
the number of trees : 67
Done processing 67 trees (48.344sec)
the number of trees : 78
Done processing 78 trees (56.638sec)
the number of trees : 89
Done processing 89 trees (64.037sec)
the number of trees : 100
Done processing 100 trees (72.883sec)
67 11.840059155801537
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (5.090sec)
[0. 0. 0. ... 0. 0. 1.]
the max depth : 10
Done processing 10 trees (16.442sec)
[1. 1. 1. ... 0. 0. 0.]
the max depth : 100
Done processing 100 trees (49.503sec)
[0. 0. 1. ... 1. 1. 1.]
10 10.983861400622605

plt.figure(figsize=(10, 5))

# Left panel: cross-validated score for each n_estimators value tried.
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

# Right panel: cross-validated score for each max_depth value tried.
plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')

打印结果：

# Fit the final forest on the full training set using the selected
# hyper-parameters.
forest_params = {'n_estimators': best_n, 'max_depth': best_m}
model = RandomForestClassifier(**forest_params)
model.fit(train_kobe, train_label)

打印结果：

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

科比数据集分析及预测相关推荐

宝可梦数据集分析及预测
前言以下内容为本人学习过程中记录,仅用于学习,如有错误或者纰漏,请留言指正,谢谢. 数据集和代码下载 – 百度云链接:https://pan.baidu.com/s/1RFUEVcD85J2AQ3_ ...
机器学习实战之科比数据集分析（随机森林寻最优值参数）
文章目录总体思路分为三部 1.查看数据,对数据进行清洗,规约 1.1 查看数据 1.2 数据清洗,规约 1.3 删除不相关的特征 1.4 数据one-hot处理* 2.建立模型,挑选出最优参数 2. ...
NBA球星生涯数据集分析
源码链接: https://download.csdn.net/download/qq_58012062/87541713?spm=1001.2014.3001.5501 数据提取:链接:https: ...
python计算多个模型在不同数据集上的预测概率、获取每个数据集上的最优模型、多个最优模型的ROC曲线进行对比分析
pytyon计算多个模型在不同数据集上的预测概率.获取每个数据集上的最佳模型.多个最优模型的ROC曲线进行对比分析目录
练习：科比数据集的处理和预测
import pandas as pd #读取数据,并返回一个DataFrame对象 raw = pd.read_csv("F:\\skdata\\kobe.csv") #把剩余时 ...
酒店数据集退订分析与预测
酒店数据集退订分析与预测 1.背景 2.提出问题 3.理解数据 3.1数据清洗 4.可视化分析 4.1 两家酒店总体退订率情况 4.2 退订用户特征 4.2.1用户需求属性 4.2.2 用户行为属性 ...
基于分布式的智联招聘数据的大屏可视化分析与预测
项目需求分析及体系架构 1.1项目介绍互联网成了海量信息的载体,目前是分析市场趋势.监视竞争对手或者获取销售线索的最佳场所,数据采集以及分析能力已成为驱动业务决策的关键技能.<计算机行业岗位招 ...
python之AQI分析与预测
AQI分析与预测背景介绍 AQI,指空气质量指数,用来衡量空气清洁或污染的程度.值越小,表示空气质量越好. 分析目标哪些城市的空气质量较好/较差? 空气质量在地理位置分布上,是否具有一定的规律性? ...
AQI(Air Quality Index)分析与预测
AQI(Air Quality Index)分析与预测背景: 空气质量指数是用来衡量空气清洁或者污染的程度,值越小,表示空气质量越好;近年来,空气质量越来越受到人们的关注. 任务描述: 一.描述性统 ...

科比数据集分析及预测

科比数据集分析及预测相关推荐

最新文章

热门文章