SKLearn 信用卡欺诈检测(creditcard)

creditcard.csv文件可以在CSDN下载，

https://download.csdn.net/download/bbqqlover/10179806

两个common func的代码，

_common_func.py

'''
Created on 2020年4月8日@author: Lenovo
'''
import numpy as np
import pandas as pd# 引入逻辑回归模型
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
# 引入pyplot模块
import matplotlib.pyplot as plt
import itertools# 交叉训练，并返回最佳C值的函数
def print_KFold_scores(x_train_data, y_train_data):
    '''
    Select the logistic-regression C value with the best cross-validated recall.

    Every candidate C is scored with an L1-penalised logistic regression
    (liblinear solver) under 5-fold cross-validation without shuffling.
    The recall of each fold and the mean recall per C are printed.

    :param x_train_data: feature DataFrame; rows are selected with ``.iloc``
    :param y_train_data: binary labels aligned row-for-row with ``x_train_data``
        (single-column DataFrame, Series or 1-D array)
    :return: the candidate C with the highest mean recall (first one on ties)
    '''
    fold = KFold(n_splits=5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # Flatten the labels once; this also lets callers pass a Series or array
    # instead of the single-column DataFrame the .iloc[idx, :] form required.
    labels = np.asarray(y_train_data).ravel()

    mean_recalls = []
    # Cross-validate each penalty strength in turn.
    for c_param in c_param_range:
        print("-------------------------------------------------------")
        print('C Parameter:', c_param)
        print("-------------------------------------------------------")
        print('')

        recall_accs = []
        for it, (train_idx, test_idx) in enumerate(fold.split(x_train_data)):
            # Logistic regression with L1 regularisation.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(x_train_data.iloc[train_idx, :], labels[train_idx])
            # Predict on the same container type used for fitting, so
            # scikit-learn does not warn about missing feature names.
            y_pred_undersample = lr.predict(x_train_data.iloc[test_idx, :])
            # Recall of this fold.
            recall_acc = recall_score(labels[test_idx], y_pred_undersample)
            recall_accs.append(recall_acc)
            print(f"Iteration: {it}, recall = {recall_acc}")

        # Mean recall for this penalty strength.
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # One row per candidate C. Built in one step, because the earlier
    # index=range(len(c_param_range), 2) was an empty range.
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': mean_recalls,
    })
    # Candidate with the highest mean recall.
    best_row = results_table["Mean recall score"].astype(float).idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    print("*******************************************************************************")
    print("最佳C参数模型 = ", best_c)
    print("*******************************************************************************")
    return best_c


# Helper that plots a confusion matrix.
def plot_confusion_matrix(cm, classes, title="Confusion Matrix", cmap=plt.cm.Blues):
    '''
    Draw a confusion matrix as an annotated heat map on the current figure.

    :param cm: confusion matrix (2-D array; rows are indexed first)
    :param classes: class labels used as tick labels on both axes
    :param title: title of the plot
    :param cmap: matplotlib colour map used for the heat map
    '''
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            if count > half_max:
                text_colour = "white"
            else:
                text_colour = "black"
            # x is the column and y is the row of the matrix cell.
            plt.text(col, row, count,
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    # Axis captions.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

加载数据，1_loaddata.py

'''
Created on 2020-04-07, @author: Lenovo

Load creditcard.csv, show the class balance and normalise the Amount column.
'''
from sklearn.preprocessing import StandardScaler
from const import CSV_PATH
import pandas as pd

# Load the data.
data = pd.read_csv(CSV_PATH)
# Quick overview of the data.
print(data.shape)
print(data.head())
# Number of rows per class, ordered by class label.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_class = data['Class'].value_counts(sort=True).sort_index()
print(count_class)
# Amount has a much larger scale than the other features, so standardise it.
# ravel() makes the scaler's (n, 1) output an explicit 1-D column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)
print(data.head())

程序输出如下，0有28万个，1只有不到500个，数据不均衡。

下采样训练代码，2_downsample_train.py

'''
Created on 2020-04-08, @author: Lenovo

Under-sampling example: balance the classes by drawing as many normal rows
as there are fraud rows, then search for the best C on that balanced set.
'''
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from card._common_func import print_KFold_scores
from const import CSV_PATH
import pandas as pd

# To silence scikit-learn warnings, uncomment:
# import warnings
# warnings.filterwarnings('ignore')

# Fixed seed so that the under-sample is the same on every run.
RANDOM_SEED = 0

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Number of fraud samples.
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)
# Index labels of the fraud samples.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw the same number of normal samples at random, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.RandomState(RANDOM_SEED)
random_normal_indexes = rng.choice(normal_indexes, num_records_fraud, replace=False)
# Combine the fraud labels and the sampled normal labels.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
# These are index labels, so select with .loc rather than positional .iloc.
under_sample_data = data.loc[under_sample_index, :]
# Under-sampled features: every column except Class.
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
# Under-sampled labels: the Class column only.
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Share of fraud samples in the under-sampled set.
print(len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))

# Split the under-sampled set into training and test sets, 7:3.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=0)
# Cross-validate logistic regression and print the best penalty parameter.
best_c = print_KFold_scores(X_undersample_train, y_undersample_train)

程序输出如下，

可见，对 5 个候选 C 值 [0.01, 0.1, 1, 10, 100] 分别进行 5 折交叉验证后，平均 recall 最高的最佳参数是 0.01

使用下采样得到的最佳C参数，预测下采样测试集，绘制混淆矩阵

3_downsample_predict.py

'''
Created on 2020-04-08, @author: Lenovo

Train on the under-sampled training set with C = 0.01, predict the
under-sampled test set and plot the confusion matrix.
'''
from sklearn.linear_model import LogisticRegression
# Import from the public module; sklearn.metrics._classification is private.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fixed seed so that the under-sample is the same on every run.
RANDOM_SEED = 0

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Number of fraud samples.
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)
# Index labels of the fraud samples.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw the same number of normal samples at random, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.RandomState(RANDOM_SEED)
random_normal_indexes = rng.choice(normal_indexes, num_records_fraud, replace=False)
# Combine the fraud labels and the sampled normal labels.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
# These are index labels, so select with .loc rather than positional .iloc.
under_sample_data = data.loc[under_sample_index, :]
# Under-sampled features and labels.
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split the under-sampled set into training and test sets, 7:3.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=0)

# Model with the C value chosen on the under-sampled set.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Fit on the under-sampled training set.
lr.fit(X_undersample_train, y_undersample_train.values.ravel())
# Predict the under-sampled test set.
y_pred_undersample = lr.predict(X_undersample_test)

# Confusion matrix from ground-truth and predicted labels.
cnf_matrix = confusion_matrix(y_undersample_test.values.ravel(), y_pred_undersample)
# Print numpy values with two decimals.
np.set_printoptions(precision=2)
# Recall = TP / (TP + FN), read from the confusion matrix.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

程序输出如下，可以看出，此次预测，数据量较小，虽然recall达到了 0.91，但是由于FP值较多，精度不高，不足0.9

使用下采样最佳C参数预测全测试集，4_downsample_predict_all.py

'''
Created on 2020-04-08, @author: Lenovo

Train on the under-sampled training set, then predict the test split of the
full data set. The article reports many false positives, i.e. low precision.
'''
from sklearn.linear_model import LogisticRegression
# Import from the public module; sklearn.metrics._classification is private.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fixed seed so that the under-sample is the same on every run.
RANDOM_SEED = 0

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class.
X = data.loc[:, data.columns != 'Class']
# Labels: the Class column only.
y = data.loc[:, data.columns == 'Class']

# Number of fraud samples.
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)
# Index labels of the fraud samples.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw the same number of normal samples at random, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.RandomState(RANDOM_SEED)
random_normal_indexes = rng.choice(normal_indexes, num_records_fraud, replace=False)
# Combine the fraud labels and the sampled normal labels.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
# These are index labels, so select with .loc rather than positional .iloc.
under_sample_data = data.loc[under_sample_index, :]
# Under-sampled features and labels.
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Training and test sets of the full data, 7:3.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Training and test sets of the under-sampled data, 7:3.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=0)

# Model with the C value chosen on the under-sampled set.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Fit on the under-sampled training set.
lr.fit(X_undersample_train, y_undersample_train.values.ravel())
# Predict the full test set. A DataFrame is passed, as in fit, so that
# scikit-learn does not warn about missing feature names.
y_pred = lr.predict(X_test)

# Confusion matrix from ground-truth and predicted labels.
cnf_matrix = confusion_matrix(y_test.values.ravel(), y_pred)
# Print numpy values with two decimals.
np.set_printoptions(precision=2)
# Recall = TP / (TP + FN), read from the confusion matrix.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

程序输出如下，Recall 0.93，精度惨不忍睹， FP太多

下采样模型的 threshold（限值）设置。之前是直接输出 0、1 类别，此处改为输出概率：概率大于限值，认为是正类；概率小于限值，认为是负类。

5_downsample_threshold.py

'''
Created on 2020-04-08, @author: Lenovo

Threshold demo on the under-sampled data: predict class probabilities and
show recall and the confusion matrix for thresholds from 0.1 to 0.9.
'''
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
# Import from the public module; sklearn.metrics._classification is private.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from card._common_func import plot_confusion_matrix
from const import CSV_PATH

# Fixed seed so that the under-sample is the same on every run.
RANDOM_SEED = 0

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Number of fraud samples.
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)
# Index labels of the fraud samples.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw the same number of normal samples at random, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.RandomState(RANDOM_SEED)
random_normal_indexes = rng.choice(normal_indexes, num_records_fraud, replace=False)
# Combine the fraud labels and the sampled normal labels.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
# These are index labels, so select with .loc rather than positional .iloc.
under_sample_data = data.loc[under_sample_index, :]
# Under-sampled features and labels.
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split the under-sampled set into training and test sets, 7:3.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=0)

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Fit on the under-sampled training set.
lr.fit(X_undersample_train, y_undersample_train.values.ravel())
# Class probabilities for the under-sampled test set, instead of 0/1 labels.
# A DataFrame is passed, as in fit, to avoid the feature-name warning.
y_pred_undersample_proba = lr.predict_proba(X_undersample_test)
# Print numpy values with two decimals.
np.set_printoptions(precision=2)
y_true = y_undersample_test.values.ravel()

# A probability above the threshold counts as positive, otherwise negative.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for j, i in enumerate(thresholds, start=1):
    # 1 where the fraud probability exceeds the threshold, else 0.
    y_test_predictions = (y_pred_undersample_proba[:, 1] > i).astype(int)
    # j-th panel of a 3 x 3 grid of subplots.
    plt.subplot(3, 3, j)
    # Confusion matrix for this threshold.
    cnf_matrix = confusion_matrix(y_true, y_test_predictions)
    # Recall for this threshold.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    # Draw the confusion matrix in the current panel.
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
plt.show()

程序输出如下，可以看出，将 threshold 设置成 0.5、0.6 时，FP 和 FN 都比较小，此时 precision 和 recall 达到平衡。再更大时，FN 增加，recall 降低。

全集训练，寻找best-c，不做采样

6_all_sample_train.py

'''
Created on 2020-04-08, @author: Lenovo

Search for the best C on the full training set, without any resampling.
The article reports low recall here because the classes are imbalanced.
'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import print_KFold_scores
from const import CSV_PATH
import numpy as np
import pandas as pd

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class.
X = data.loc[:, data.columns != 'Class']
# Labels: the Class column only.
y = data.loc[:, data.columns == 'Class']

# Number of fraud samples.
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)

# Training and test sets of the full data, 7:3.
# The under-sampling steps were removed here: their results were never used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Cross-validate on the full, imbalanced training set.
best_c = print_KFold_scores(X_train, y_train)

程序输出如下，

可以看出，因为样本不均衡，全量训练时，recall较低，不可采用

使用smote算法过采样，寻找最佳C参数

7_smote_train.py

'''
Created on 2020-04-08, @author: Lenovo

Over-sampling example: generate synthetic minority samples with SMOTE on the
training split, then search for the best C on the over-sampled data.
'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import print_KFold_scores
from imblearn.over_sampling import SMOTE
from const import CSV_PATH
import pandas as pd

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class.
features = data.loc[:, data.columns != 'Class']
# Labels: the Class column only.
labels = data.loc[:, data.columns == 'Class']

# Split into training and test sets, 8:2.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Over-sample the training split only, so the test split stays untouched.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
# print_KFold_scores selects rows with .iloc, so make sure both are DataFrames.
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

# Search for the best C on the over-sampled training data.
best_c = print_KFold_scores(os_features, os_labels)

输出如下，可以看出过采样最佳C参数为100，

使用过采样模型预测所有过采样测试集，

8_smote_predict.py

'''
Created on 2020-04-08, @author: Lenovo

Train on the SMOTE over-sampled training data with C = 100, predict the
untouched test split and plot the confusion matrix.
'''
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
# Import from the public module; sklearn.metrics._classification is private.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the data.
data = pd.read_csv(CSV_PATH)
# Amount has a much larger scale than the other features, so standardise it.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
# Drop the unused Time column and the original Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class.
features = data.loc[:, data.columns != 'Class']
# Labels: the Class column only.
labels = data.loc[:, data.columns == 'Class']

# Split into training and test sets, 8:2.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Over-sample the training split only, so the test split stays untouched.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

# Model with the C value chosen on the over-sampled set.
lr = LogisticRegression(C=100, penalty='l1', solver='liblinear')
# Fit on the whole over-sampled training set.
lr.fit(os_features, os_labels.values.ravel())
# Predict the test split. A DataFrame is passed, as in fit, so that
# scikit-learn does not warn about missing feature names.
y_pred = lr.predict(features_test)

# Confusion matrix from ground-truth and predicted labels.
cnf_matrix = confusion_matrix(labels_test.values.ravel(), y_pred)
# Print numpy values with two decimals.
np.set_printoptions(precision=2)
# Recall = TP / (TP + FN), read from the confusion matrix.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

程序输出如下，

可以看到，使用过采样准确率和召回率都达到了一个比较好的值。

SKLearn 信用卡欺诈检测(creditcard)相关推荐

机器学习项目实战----信用卡欺诈检测(二)
六.混淆矩阵: 混淆矩阵是由一个坐标系组成的,有x轴以及y轴,在x轴里面有0和1,在y轴里面有0和1.x轴表达的是预测的值,y轴表达的是真实的值.可以对比真实值与预测值之间的差异,可以计算当前模型衡量 ...
机器学习之信用卡欺诈检测（零基础，附数据及详细python代码2022年Tensorflow2）
首先该数据参考:机器学习项目实战之信用卡欺诈检测(零基础,附数据及详细python代码) (4条消息) 机器学习项目实战之信用卡欺诈检测(零基础,附数据及详细python代码)_西南交大-Liu_z的 ...
【实战案例】分享6种常用的信用卡欺诈检测算法（附 Python 代码）
大家好,本文旨在使用 XGBoost.随机森林.KNN.逻辑回归.SVM 和决策树解决信用卡潜在欺诈的分类问题,内容较长,建议收藏.点赞. 文章目录技术提升案例简介导入相关模块导入数据探索性 ...
机器学习实战 | Python 信用卡欺诈检测其实特简单
本文旨在使用 XGBoost.随机森林.KNN.逻辑回归.SVM 和决策树解决分类问题.喜欢记得收藏.关注.点赞. 注:文末提供技术交流群案例简介假设你受雇于帮助一家信用卡公司检测潜在的欺诈案件, ...
机器学习实战分享：用 Python 进行信用卡欺诈检测
本文旨在使用 XGBoost.随机森林.KNN.逻辑回归.SVM 和决策树解决分类问题,内容较长,建议收藏.关注.点赞. 案例简介假设你受雇于帮助一家信用卡公司检测潜在的欺诈案件,你的工作是确保客户 ...
机器学习实战：信用卡欺诈检测
本文旨在使用 XGBoost.随机森林.KNN.逻辑回归.SVM 和决策树解决分类问题案例简介假设你受雇于帮助一家信用卡公司检测潜在的欺诈案件,你的工作是确保客户不会因未购买的商品而被收取费用.给 ...
信用卡欺诈检测python_Python机器学习实战：信用卡欺诈检测
原标题:Python机器学习实战:信用卡欺诈检测本文作者:唐宇迪 ,文末有彩蛋!本期送书python! 故事背景:原始数据为个人交易记录,但是考虑数据本身的隐私性,已经对原始数据进行了类似PCA的处 ...
毕业设计 - 题目：基于大数据的信用卡欺诈检测
文章目录 0 简介 1 数据集 2 分析流程 3 数据预览 3.1 数据浏览 3.1.1 查看数据分布 4 数据规范化 4.1 amount特征缩放并去除time字段 4.2 解决样本不均衡问题 5 ...
qiuzitao机器学习（六）：信用卡欺诈检测项目
机器学习实战–信用卡欺诈检测项目学校大三校企合作的课程设计项目一.任务基础拿到的信用卡数据集是由欧洲人于2013年9月使用信用卡进行交易的数据.此数据集显示两天内发生的交易,其中284807笔交 ...

SKLearn 信用卡欺诈检测(creditcard)

SKLearn 信用卡欺诈检测(creditcard)相关推荐

最新文章

热门文章