导入相应的工具包

from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from tensorflow.compat.v1.metrics import accuracy
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.manifold import TSNE   #cuml.manifold  windows系统再考虑
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patches as mpatchesfrom tabulate import tabulate
%matplotlib inline

加载数据集

 dataset = pd.read_csv('./creditcard.csv')

绘图中文设置

# Matplotlib settings so Chinese labels and the minus sign render correctly.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',   # a font that contains CJK glyphs
    'axes.unicode_minus': False,   # draw the minus sign as ASCII '-'
})

抽取列名、提取数据特征X、提取标签Y

# Build the feature column list: 'Time', 'V1'..'V28', 'Amount' (everything
# except the 'Class' label), then split the frame into features X and label y.
# BUG FIX: the original for-loop had its body fused onto a single line,
# which is invalid Python syntax.
columnas = ['Time'] + ['V' + str(i) for i in range(1, 29)] + ['Amount']
y = dataset['Class']                         # target label (1 = fraud)
X = pd.DataFrame(dataset, columns=columnas)  # feature matrix

由于数据差别相对较大,对特征Time、Amount进行缩放

def normalize_data(X):
    """Standard-scale a one-column DataFrame and return it as a flat Series.

    Parameters
    ----------
    X : pd.DataFrame
        A single-column frame, e.g. ``dataset[['Time']]``.

    Returns
    -------
    pd.Series
        The z-scored values as a 1-D Series.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(X)
    # fit_transform returns a 2-D array; flatten it into a plain list.
    flat_list = [item for sublist in scaled_data.tolist() for item in sublist]
    return pd.Series(flat_list)


def scaling(dataset):
    """Return the standard-scaled 'Time' and 'Amount' columns of *dataset*."""
    scaled_time = normalize_data(dataset[['Time']])
    scaled_amount = normalize_data(dataset[['Amount']])
    return scaled_time, scaled_amount


# The ranges of Time/Amount differ greatly from V1..V28, so scale them.
scaled_time, scaled_amount = scaling(dataset)
dataset = pd.concat([dataset, scaled_amount.rename('scaled_amount'), scaled_time.rename('scaled_time')], axis=1)
# Drop the raw columns; they are replaced by the scaled versions above.
dataset.drop(['Amount', 'Time'], axis=1, inplace=True)
X = dataset.iloc[:, dataset.columns != 'Class']   # [284807 rows x 29 columns]
Y = dataset.iloc[:, dataset.columns == 'Class']   # [284807 rows x 1 columns]
print("经过数据缩放后的dataset")
print(dataset)
print("经过数据缩放后的X")
print(X)
print("经过数据缩放后的Y")
print(Y)
# At this point dataset, X and Y have all been feature-scaled.

原数据集正反样本的不平衡性力促我们去平衡数据,采用下采样(可考虑使用NearMiss下采样)或上采样(SMOTE或SMOTETomek)

def undersample(dataset):
    """Random under-sampling: keep all fraud rows, sample as many non-fraud rows.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame containing a binary 'Class' column (1 = fraud).

    Returns
    -------
    (X_under, y_under, dataset_under)
        Balanced (1:1) features, labels, and the combined frame.
    """
    # All fraud records.
    dataset_fraud = dataset[dataset['Class'] == 1]
    dataset_fraud_x = dataset_fraud.drop(['Class'], axis=1)
    dataset_fraud_y = dataset_fraud['Class']
    # Sample the same number of non-fraud records (fraud : non-fraud = 1 : 1).
    is_not_fraud = dataset['Class'] == 0
    dataset_not_fraud = dataset[is_not_fraud].sample(n=len(dataset_fraud_x), replace=True, random_state=1)
    dataset_not_fraud_x = dataset_not_fraud.drop(['Class'], axis=1)
    dataset_not_fraud_y = dataset_not_fraud['Class']
    X_under = pd.concat([dataset_not_fraud_x, dataset_fraud_x])
    y_under = pd.concat([dataset_not_fraud_y, dataset_fraud_y])
    # FIX: the original rebuilt X_under/y_under a second time under the
    # misleading names 'fraud'/'not_fraud'; reuse them directly instead.
    dataset_under = pd.concat([X_under, y_under], axis=1)
    return X_under, y_under, dataset_under


X_under, y_under, dataset_under = undersample(dataset)
print("下采样之后的dataset_under,数量应该是 984 rows x 31 columns  ")
print(dataset_under)
print("下采样之后的X_under,数量应该是 984 rows x 30 columns")
print(X_under)
print("下采样之后的y_under,数量应该是 984 rows x 1 columns ")
print(y_under)
# All under-sampled data is now available: X_under, y_under, dataset_under.

至此,已完成数据特征缩放,并采用下采样使整体正反样例达到了平衡,可通过可视化查看

# Class distribution before under-sampling.
fig = plt.figure(figsize=(10, 8))
# BUG FIX: pass the column as keyword `x=` — seaborn >= 0.12 rejects a
# positional column name in countplot.
sns.countplot(x='Class', data=dataset)
plt.title('Distribucion Uniforme de las clases')
plt.xlabel('Clase (0: No fraudulento, 1: Fraudulento)')
plt.ylabel('Conteo')
plt.savefig("dataset.jpg", dpi=1400)
plt.show()
# Class distribution after under-sampling (should be a balanced 1:1).
fig = plt.figure(figsize=(10, 8))
sns.countplot(x='Class', data=dataset_under)
plt.title('Distribucion Uniforme de las clases')
plt.xlabel('Clase (0: No fraudulento, 1: Fraudulento)')
plt.ylabel('Conteo')
plt.savefig("undersample.jpg", dpi=1400)
plt.show()

数据降维,原本数据特征数 Time,V1-V28,Amount,  加一个分类标签Class

# Dimensionality reduction helpers.
def reduce_dimensionality(X):
    """Project the feature matrix X (features only, no target) onto 2-D with t-SNE."""
    # n_components: dimensionality of the embedded space.
    return TSNE(n_components=2, random_state=1).fit_transform(X)


def plot(X, y):
    """Scatter-plot a 2-D embedding *X* coloured by the binary labels *y*."""
    plt.figure()
    f, ax = plt.subplots(figsize=(24, 16))
    blue_patch = mpatches.Patch(color='blue', label='No Fraud')
    red_patch = mpatches.Patch(color='red', label='Fraud')
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.tick_params(axis='both', which='minor', labelsize=20)
    ax.scatter(X[:, 0], X[:, 1], c=(y == 0), cmap='coolwarm', label='No Fraud', linewidths=2)
    ax.scatter(X[:, 0], X[:, 1], c=(y == 1), cmap='coolwarm', label='Fraud', linewidths=2)
    ax.set_title('t-SNE', fontsize=30)
    ax.grid(True)  # add a grid
    ax.legend(handles=[blue_patch, red_patch], fontsize=18)  # add the legend

在下采样数据集上进行数据降维,把下采样训练集X_under降至二维

# Reduce the under-sampled training features to 2-D with t-SNE and visualise them.
x_under_reduced = reduce_dimensionality(X_under)
print(x_under_reduced)
plot(x_under_reduced, y_under)

划分数据集(原始数据集、下采样数据集、过采样)

from imblearn.combine import SMOTETomek
from collections import Counter

# Split the original data set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1, shuffle=True)
# Split the under-sampled data set.
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(X_under, y_under, test_size=0.25, random_state=1, shuffle=True)
print(len(y_train[y_train.Class == 1]))
print(len(y_train[y_train.Class == 0]))
print(Counter(y_train))
print(Counter(X_train))
print(Counter(X_under_train))
print(Counter(y_under_train))
# Over-sampling with SMOTETomek.
print(len(X_train))
print(len(y_train[y_train.Class == 1]))
print(len(y_train[y_train.Class == 0]))
# FIX: renamed the resampler `os` -> `smote_tomek`; `os` shadows the
# standard-library module name.
smote_tomek = SMOTETomek(random_state=42)
X_train_ns, y_train_ns = smote_tomek.fit_resample(X_train, y_train)
print(len(X_train_ns))
print(len(y_train_ns[y_train_ns.Class == 1]))
print(len(y_train_ns[y_train_ns.Class == 0]))

准备封装一些函数,混淆矩阵、模型量化值计算(acc、pre、rec、f1、auc)、模型训练封装、模型列表

# Confusion-matrix plotting helper.
def draw_matrix_confusion(yTest, yPred, subplot, name, measures):
    """Draw one confusion-matrix heatmap into slot *subplot* of a 3x2 grid.

    Parameters
    ----------
    yTest, yPred : array-like
        True and predicted labels.
    subplot : int
        1-based position inside the 3x2 subplot grid.
    name : str
        Subplot title (model name).
    measures : sequence
        (acc, prec, rec, f1, roc_auc); the last entry is shown under the x-label.

    Returns
    -------
    np.ndarray
        The 2x2 confusion matrix.
    """
    print(measures)
    LABELS = ['No Fraude', 'Fraude']
    conf_matrix = metrics.confusion_matrix(yTest, yPred)
    sns.set(font_scale=2)
    ax = plt.subplot(3, 2, subplot)
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    plt.title(name, fontsize=20, fontweight='bold')
    plt.ylabel('Clase verdadera', fontsize=20, fontweight='bold')
    plt.xlabel('Clase predicha\nROC-AUC: {}'.format(measures[-1]), fontsize=20, fontweight='bold')
    return conf_matrix
# Compute the quantitative evaluation measures.
def calculate_measures(y_test, yPred):
    """Return (accuracy, macro-precision, macro-recall, weighted-f1, roc_auc)."""
    acc = metrics.accuracy_score(y_test, yPred)
    # average='micro' would make precision/recall/f1 all identical here.
    prec = metrics.precision_score(y_test, yPred, average='macro')
    rec = metrics.recall_score(y_test, yPred, average='macro')
    f1 = metrics.f1_score(y_test, yPred, average='weighted', labels=np.unique(yPred))
    roc_auc = metrics.roc_auc_score(y_test, yPred)
    # MCC = metrics.matthews_corrcoef(y_test, yPred)
    return acc, prec, rec, f1, roc_auc
# Train one model and evaluate it.
def make_machine_learning(X_train, X_test, y_train, y_test, classifier, auto_enc=False):
    """Fit *classifier*, predict on X_test and return (measures, yPred).

    A Keras ``Model`` is treated as an autoencoder: it is trained to
    reconstruct its input, and a sample is flagged as fraud when its
    reconstruction error exceeds a fixed threshold. Any other estimator
    uses the plain scikit-learn fit/predict API.
    """
    if isinstance(classifier, Model):
        classifier.fit(X_train, y_train, batch_size=256, epochs=100, shuffle=True, validation_split=0.20)
        predictions = classifier.predict(X_test)
        # Per-row mean squared reconstruction error.
        mse = np.mean(np.power(X_test - predictions, 2), axis=1)
        error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': y_test})
        threshold = 1.68  # NOTE(review): hand-tuned cut-off — confirm against validation data
        yPred = [1 if e > threshold else 0 for e in error_df.reconstruction_error.values]
    else:
        classifier.fit(X_train, y_train)
        yPred = classifier.predict(X_test)
    measures = calculate_measures(y_test, yPred)
    return measures, yPred
def train(X_train, X_test, y_train, y_test):
    """Train every (name, classifier) pair in the global ``algs`` list,
    draw each confusion matrix into a shared 3x2 figure, and print a
    summary table of the performance measures."""
    fig = plt.figure(figsize=(26, 20))
    fig.suptitle("matrix", fontsize=30, fontweight='bold')
    performance_measures = []
    i = 1
    for name, classifier in algs:
        print("用该技术训练模型" + name)
        measures, yPred = make_machine_learning(X_train, X_test, y_train, y_test, classifier)
        performance_measures.append([name, *measures])
        print("绘制混淆矩阵" + name)
        draw_matrix_confusion(y_test, yPred, i, name, measures)
        i += 1
        print('\n')
    fig.tight_layout(pad=3.0)
    plt.show()
    print('')
    print(tabulate(performance_measures, headers=['Method', 'Accuracy', 'Precision', 'Recall', 'f1 Score', 'ROC AUC']))

神经网络、随机森林(属于集成模型)、K-近邻、支持向量机、逻辑回归

# (display name, estimator) pairs evaluated by train().
algs = [
    ['Neural Network', MLPClassifier(hidden_layer_sizes=(3, 1,), activation='relu',
                                     solver='adam', alpha=1, max_iter=2000, random_state=1)],
    ['Random Forest', RandomForestClassifier(n_jobs=-1)],
    ['KNN', KNeighborsClassifier(n_neighbors=3)],
    # FIX: label typo 'Support Machine Vector' -> 'Support Vector Machine'
    ['Support Vector Machine', OneVsRestClassifier(LinearSVC(random_state=0, tol=1e-05), n_jobs=-1)],
    ['Logistic Regression', LogisticRegression(penalty='l2', solver='lbfgs', random_state=0, max_iter=1000)],
]

下采样数据集上进行训练测试

# Train and test entirely on the under-sampled data set.
train(X_under_train, X_under_test, y_under_train, y_under_test)

结果如下:

再考虑使用下采样测试集的训练模型测试原始数据集(希望有人指导一下 这个地方为什么precision这么低)

# Train on the under-sampled training set, test on the original (imbalanced) test set.
# Precision drops here because the original test set is dominated by non-fraud rows.
train(X_under_train, X_test, y_under_train, y_test)

过采样数据集训练,原始测试集测试

# Train on the over-sampled (SMOTETomek) training set, test on the original test set.
train(X_train_ns, X_test, y_train_ns, y_test)

最后再加一个模型吧   CatBoost

SEED = 1234
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Drop every row containing NaN (all dropna() parameters left at defaults).
credit_catboost_ready_df = dataset.dropna()
features = [feat for feat in list(credit_catboost_ready_df) if feat != 'Class']
# Indices of the non-float columns, passed to CatBoost as categorical features.
card_categories = np.where(credit_catboost_ready_df[features].dtypes != float)[0]
params = {
    'iterations': 5000,
    'learning_rate': 0.01,
    'cat_features': card_categories,
    'depth': 3,
    'eval_metric': 'AUC',
    'verbose': 200,
    'od_type': "Iter",   # overfitting detector type
    'od_wait': 500,
    # 'early_stopping_rounds': 200,
    'random_seed': SEED,
    'task_type': "GPU",
}

下采样数据集上测试

# Evaluate CatBoost on the under-sampled test set.
cat_model = CatBoostClassifier(**params)  # task_type="GPU" comes from params
# eval_set: validation data; plot=True draws metrics / elapsed time while training.
cat_model.fit(X_under_train, y_under_train, eval_set=(X_under_test, y_under_test),
              use_best_model=True, plot=True)
y_pred = cat_model.predict(X_under_test)
print(classification_report(y_under_test, y_pred))
acc, prec, rec, f1, roc_auc = calculate_measures(y_under_test, y_pred)
print(acc)
print(prec)
print(rec)
print(f1)
print(roc_auc)
# Reference output:
# 0.926829268292683
# 0.9290254237288136
# 0.930622009569378
# 0.9268969965732714
# 0.930622009569378

下采样测试集训练 ,原始测试集测试

# Train on the under-sampled training set, test on the original test set.
cat_model.fit(X_under_train, y_under_train, eval_set=(X_under_test, y_under_test),
              use_best_model=True, plot=True)
y_pred = cat_model.predict(X_test)
# BUG FIX: the report must compare y_test with y_pred (predictions on X_test);
# the original passed y_under_test, whose length does not match y_pred and
# raises a ValueError in classification_report.
print(classification_report(y_test, y_pred))
acc, prec, rec, f1, roc_auc = calculate_measures(y_test, y_pred)
print(acc)
print(prec)
print(rec)
print(f1)
print(roc_auc)
# Reference output:
# 0.9858150051964832
# 0.5450374769919609
# 0.9389267733361922
# 0.9915545517299706
# 0.9389267733361922

基于creditcard数据集的经典模型性能比较相关推荐

  1. 寺冈labelnet使用说明_基于imagenet数据集的ResNet50模型训练示例

    基于imagenet数据集的ResNet50模型训练示例 训练前准备 数据集获取 本训练示例以imagenet数据集为例,从imagenet官方网站http://www.image-net.org/获 ...

  2. 数据集大小与模型性能的敏感性分析

    [翻译自 : sensitivity-analysis-of-dataset-size-vs-model-performance] [说明:Jason Brownlee PhD大神的文章个人很喜欢,所 ...

  3. 基于双语数据集搭建seq2seq模型

    目录 一.前言 二.数据预处理 2.1 数据清洗 2.2 词元化 2.3 建立词表 2.4 数据加载 2.5 构建数据集 三.模型搭建 3.1 Encoder-Decoder 架构 3.2 Encod ...

  4. 【深度学习】近几年,关于基于Imagenet数据集图像分类的模型总结

    「@Author:Runsen」 在过去的几年里,许多深度学习模型涌现出来,例如层的类型.超参数等.在本系列中,我将回顾几个最显着的 deeplearn 图像分类的模型. AlexNet (2012 ...

  5. 近几年,关于基于Imagenet数据集图像分类的模型总结

    @Author:Runsen 在过去的几年里,许多深度学习模型涌现出来,例如层的类型.超参数等.在本系列中,我将回顾几个最显着的 deeplearn 图像分类的模型. 文章目录 AlexNet (20 ...

  6. mmdetection3d基于kitti数据集训练pointpillars模型

    当mmdetection3d环境安装成功后,可看上一篇如何安装mmdetection3d mmdetection3d官网:Log Analysis - MMDetection3D 1.0.0rc1 d ...

  7. 基于matlab的COST231-hata信道模型性能仿真

    目录 1.算法概述 2.仿真效果 3.MATLAB仿真源码 1.算法概述 COST-231Hata模型是EURO-COST组成的COST工作委员会开发的Hata模型的扩展版本,应用频率在1500~20 ...

  8. ML之xgboostGBM:基于xgboostGBM算法对HiggsBoson数据集(Kaggle竞赛)训练(两模型性能PK)实现二分类预测

    ML之xgboost&GBM:基于xgboost&GBM算法对HiggsBoson数据集(Kaggle竞赛)训练(两模型性能PK)实现二分类预测 目录 输出结果 设计思路 核心代码 输 ...

  9. ML之K-means:基于DIY数据集利用K-means算法聚类(测试9种不同聚类中心的模型性能)

    ML之K-means:基于DIY数据集利用K-means算法聚类(测试9种不同聚类中心的模型性能) 目录 输出结果 设计思路 实现代码 输出结果 设计思路 1.使用均匀分布函数随机三个簇,每个簇周围1 ...

最新文章

  1. JavaScript内置一些方法的实现原理--new关键字,call/apply/bind方法--前戏
  2. Java连接数据库(2)
  3. 选择一线一张床还是小城一套房?
  4. PHP函数之HTMLSPECIALCHARS_DECODE
  5. RuntimeError: DataLoader worker (pid(s) 13512, 280, 21040) exited unexpectedly
  6. npm 运行报错“Cannot find module ‘@vue/component-compiler-utils/package.json‘”
  7. [原创]如何在Windows XP 中利用监视计算机中的资源使用情况
  8. ubuntu 中 notepad 安装
  9. J-LINK烧录bin文件
  10. 概率统计学习笔记(9)——连续型:均匀分布、指数分布
  11. 华东理工大学计算机考研资料汇总
  12. vue-quill-editor 上传视频
  13. 程序员副业赚钱之道,实现月收入增加20K
  14. 华为RH2288H V3服务器 从拆箱开始安装系统
  15. 计算机CD_ROM表示中文,CD-ROM是指什么?
  16. Crunch生成字典
  17. mysql InnoDB存储引擎的介绍
  18. CUDA+VS2017+win环境下 cuda工程环境搭建(解决标识符未定义或申明)
  19. Android 内存泄露分析
  20. 我的通宵史-网上斗地主谋生

热门文章

  1. python数据分析与数据化运营_电商数据分析与数据化运营.pdf
  2. 我35岁,程序员,“中危”后帮别人规划人生
  3. JAVA的日常小记录(MyBatis-Plus)
  4. 数据架构的本质到底是什么 by 傅一平
  5. 走投无路的算法学习笔记|Day002
  6. java若依框架开发api接口(添加新模块)
  7. Android Wi-Fi SSR功能
  8. python离线_python离线神器
  9. fun(n) c语言,阅读以下函数说明和C语言函数,将应填入(n)的字句写在答题纸的对应栏内。 [说明1] 函数int fun1(in - 赏学吧...
  10. 《缠中说禅108课》75:逗庄家玩的一些杂史 1