用R语言scorecard包做一张标准评分卡

R语言的scorecard包可以满足制作标准评分卡的全部需求。
以下利用Kaggle的Give Me Some Credit数据集进行逻辑回归建模（以熟悉scorecard包的功能为主）。

# Load the scorecard package and read the training data.
# library() stops with an error when the package is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
library(scorecard)
df <- read.csv("cs-training.csv", header = TRUE)

# Variable selection.
# SeriousDlqin2yrs is the target; every remaining column is treated as a
# candidate predictor and screened by var_filter() with its default settings.
df_all <- var_filter(df, y = "SeriousDlqin2yrs")

# Split the filtered data into training and test sets at a 70/30 ratio.
df_list <- split_df(df_all, ratio = c(0.7, 0.3))
train <- df_list[["train"]]
test <- df_list[["test"]]

 # woe 分箱 自动分箱
# Automatic WOE binning of the training set against the target.
bins <- woebin(train, y = "SeriousDlqin2yrs")
# bins holds, per variable: the bins, bad rate, the WOE mapped to each bin,
# the variable IV, the break points, and a special-value flag (for example,
# missing values kept as their own bin).
print(bins)

# Plot the bins, bad rate and IV of every variable.
woebin_plot(bins)

图示为woebin_plot(bins$age)

# Bins can be adjusted by hand when necessary.
# Example with the age variable (the adjusted result is not saved here).
woebin_adj(train, y = "SeriousDlqin2yrs", bins = bins[["age"]])

Adjust breaks for (1/1) age?
1: next
2: yes
3: back
Selection (1-3, goX, save): 2
Enter modified breaks: 40,50,60
Current breaks:
“40”, “50”, “60”

# Convert the training and test sets to WOE values using the bins above.
train_woe <- woebin_ply(train, bins = bins)
test_woe <- woebin_ply(test, bins = bins)

# Fit a logistic regression of the target on all WOE-transformed columns.
fit <- glm(
  SeriousDlqin2yrs ~ .,
  data = train_woe,
  family = binomial(link = "logit")
)
summary(fit)

# Model performance validation (KS and ROC).
# Predicted probabilities on the response scale.
train_pred <- predict(fit, newdata = train_woe, type = "response")
test_pred <- predict(fit, newdata = test_woe, type = "response")
# Performance evaluation.
# pred and label are passed by name because their positional order is not
# the same across scorecard releases; naming them keeps the call correct
# whichever version is installed.
train_perf <- perf_eva(
  pred = train_pred,
  label = train$SeriousDlqin2yrs,
  title = "train"
)
test_perf <- perf_eva(
  pred = test_pred,
  label = test$SeriousDlqin2yrs,
  title = "test"
)

# Build the scorecard from the bins and the fitted model, with explicitly
# chosen base points (points0), base odds (odds0) and pdo.
card <- scorecard(
  bins, fit,
  points0 = 600,
  odds0 = 1 / 19,
  pdo = 50,
  basepoints_eq0 = FALSE
)
print(card)

$basepoints
variable bin woe points
1: basepoints NA NA 575

$RevolvingUtilizationOfUnsecuredLines
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: RevolvingUtilizationOfUnsecuredLines [-Inf,0.2) 56904 0.5438073 55730 1174 0.02063124 -1.2363021 0.50107552 1.059351 0.2
2: RevolvingUtilizationOfUnsecuredLines [0.2,0.5) 18768 0.1793578 17834 934 0.04976556 -0.3255858 0.01653890 1.059351 0.5
3: RevolvingUtilizationOfUnsecuredLines [0.5,0.85) 13048 0.1246942 11545 1503 0.11519007 0.5850105 0.05503519 1.059351 0.85
4: RevolvingUtilizationOfUnsecuredLines [0.85, Inf) 15920 0.1521407 12455 3465 0.21765075 1.3443903 0.48670107 1.059351 Inf
is_special_values points
1: FALSE 54
2: FALSE 14
3: FALSE -26
4: FALSE -59

$age
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values points
1: age [-Inf,40) 22321 0.2133123 19957 2364 0.10590923 0.4905749 0.06354621 0.2458132 40 FALSE -16
2: age [40,56) 39135 0.3739966 35995 3140 0.08023508 0.1846426 0.01381426 0.2458132 56 FALSE -6
3: age [56,64) 19458 0.1859518 18502 956 0.04913146 -0.3390764 0.01849155 0.2458132 64 FALSE 11
4: age [64, Inf) 23726 0.2267393 23110 616 0.02596308 -1.0009739 0.14996124 0.2458132 Inf FALSE 32

$NumberOfTime30.59DaysPastDueNotWorse
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberOfTime30.59DaysPastDueNotWorse [-Inf,1) 87863 0.83966934 84343 3520 0.04006237 -0.5526310 0.2028337 0.757567 1
2: NumberOfTime30.59DaysPastDueNotWorse [1,2) 11191 0.10694763 9465 1726 0.15423108 0.9220056 0.1354517 0.757567 2
3: NumberOfTime30.59DaysPastDueNotWorse [2, Inf) 5586 0.05338303 3756 1830 0.32760473 1.9047612 0.4192815 0.757567 Inf
is_special_values points
1: FALSE 21
2: FALSE -36
3: FALSE -74

$DebtRatio
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values points
1: DebtRatio [-Inf,0.4) 56162 0.53671636 52852 3310 0.05893665 -0.14674757 0.010850193 0.07697975 0.4 FALSE 9
2: DebtRatio [0.4,0.55) 12471 0.11918005 11547 924 0.07409189 0.09833093 0.001202505 0.07697975 0.55 FALSE -6
3: DebtRatio [0.55,0.7) 6348 0.06066514 5717 631 0.09940139 0.41990619 0.012839573 0.07697975 0.7 FALSE -27
4: DebtRatio [0.7,2.75) 8355 0.07984518 7337 1018 0.12184321 0.64870968 0.044543437 0.07697975 2.75 FALSE -41
5: DebtRatio [2.75, Inf) 21304 0.20359327 20111 1193 0.05599887 -0.20099599 0.007544039 0.07697975 Inf FALSE 13

$MonthlyIncome
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
1: MonthlyIncome missing 20719 0.1980027 19542 1177 0.05680776 -0.1857974 0.006310120 0.06363238 missing TRUE
2: MonthlyIncome [-Inf,4000) 26827 0.2563742 24414 2413 0.08994670 0.3095138 0.028096469 0.06363238 4000 FALSE
3: MonthlyIncome [4000,5500) 16143 0.1542718 14892 1251 0.07749489 0.1469189 0.003549050 0.06363238 5500 FALSE
4: MonthlyIncome [5500,8000) 18397 0.1758123 17243 1154 0.06272762 -0.0803723 0.001096984 0.06363238 8000 FALSE
5: MonthlyIncome [8000, Inf) 22554 0.2155390 21473 1081 0.04792941 -0.3651100 0.024579753 0.06363238 Inf FALSE
points
1: 0
2: 0
3: 0
4: 0
5: 0

$NumberOfOpenCreditLinesAndLoans
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberOfOpenCreditLinesAndLoans [-Inf,3) 9105 0.08701261 7966 1139 0.12509610 0.67876800 0.05383830 0.06585617 3
2: NumberOfOpenCreditLinesAndLoans [3,14) 80236 0.76678135 75406 4830 0.06019742 -0.12424058 0.01121874 0.06585617 14
3: NumberOfOpenCreditLinesAndLoans [14, Inf) 15299 0.14620604 14192 1107 0.07235767 0.07277502 0.00079913 0.06585617 Inf
is_special_values points
1: FALSE -2
2: FALSE 0
3: FALSE 0

$NumberOfTimes90DaysLate
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
1: NumberOfTimes90DaysLate [-Inf,1) 98791 0.94410359 94139 4652 0.04708931 -0.3836754 0.1179653 0.8183982 1 FALSE
2: NumberOfTimes90DaysLate [1, Inf) 5849 0.05589641 3425 2424 0.41442982 2.2781174 0.7004328 0.8183982 Inf FALSE
points
1: 15
2: -87

$NumberRealEstateLoansOrLines
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberRealEstateLoansOrLines [-Inf,1) 39215 0.3747611 35964 3251 0.08290195 0.2202440 0.020002734 0.05179564 1
2: NumberRealEstateLoansOrLines [1,2) 36452 0.3483563 34533 1919 0.05264457 -0.2663114 0.022038294 0.05179564 2
3: NumberRealEstateLoansOrLines [2,3) 22083 0.2110378 20778 1305 0.05909523 -0.1438919 0.004106915 0.05179564 3
4: NumberRealEstateLoansOrLines [3, Inf) 6890 0.0658448 6289 601 0.08722787 0.2758374 0.005647699 0.05179564 Inf
is_special_values points
1: FALSE -10
2: FALSE 13
3: FALSE 7
4: FALSE -13

$NumberOfTime60.89DaysPastDueNotWorse
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberOfTime60.89DaysPastDueNotWorse [-Inf,1) 99334 0.94929281 94208 5126 0.05160368 -0.2873797 0.06931066 0.5711802 1
2: NumberOfTime60.89DaysPastDueNotWorse [1, Inf) 5306 0.05070719 3356 1950 0.36750848 2.0808794 0.50186959 0.5711802 Inf
is_special_values points
1: FALSE 8
2: FALSE -61

$NumberOfDependents
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
1: NumberOfDependents missing 2700 0.02580275 2576 124 0.04592593 -0.40991172 0.003639670 0.03289703 missing TRUE
2: NumberOfDependents [-Inf,1) 60553 0.57867928 56926 3627 0.05989794 -0.12954652 0.009184289 0.03289703 1 FALSE
3: NumberOfDependents [1,2) 18435 0.17617546 17078 1357 0.07360998 0.09128509 0.001527285 0.03289703 2 FALSE
4: NumberOfDependents [2,3) 13652 0.13046636 12568 1084 0.07940229 0.17330380 0.004224434 0.03289703 3 FALSE
5: NumberOfDependents [3, Inf) 9300 0.08887615 8416 884 0.09505376 0.37036692 0.014321353 0.03289703 Inf FALSE
points
1: 6
2: 2
3: -1
4: -2
5: -5

# Credit scoring.
# Scores for the training set.
train_score <- scorecard_ply(train, card = card, print_step = 0)
# Scores for the test (validation) set.
test_score <- scorecard_ply(test, card = card, print_step = 0)

# Model stability measure: PSI computed from the train and test scores,
# with the matching target labels supplied for each set.
psi_result <- perf_psi(
  score = list(train = train_score, test = test_score),
  label = list(
    train = train$SeriousDlqin2yrs,
    test = test$SeriousDlqin2yrs
  )
)
psi_result

用R语言scorecard包做一张标准评分卡相关推荐

R语言 scorecard包评分卡
我会把文章及时的更新到公共号上,欢迎大家的关注. library(scorecard) data("germancredit") print(dim(germancredit)) ...
R语言e1071包做SVR怎么取出权重
w = drop(t(svm_model$coefs)%*%as.matrix(X[svm_model$index,1:length(SFI)]))#取出超平面的权重向量 svm_model为训练的模 ...
r语言 bsda包_使用R语言creditmodel包进行Vintage分析或留存率分析
1 什么是vintage分析? Vintage分析(账龄分析法)被广泛应用于信用卡及信贷行业,这个概念起源于葡萄酒,即不同年份出产的葡萄酒的品质有差异,那么不同时期开户或者放款的资产质量也有差异,其核 ...
使用R语言creditmodel包进行Vintage分析或留存率分析
1 什么是vintage分析? Vintage分析(账龄分析法)被广泛应用于信用卡及信贷行业,这个概念起源于葡萄酒,即不同年份出产的葡萄酒的品质有差异,那么不同时期开户或者放款的资产质量也有差异,其核 ...
R语言可视化包ggplot2在一张图中画出两条线实战
R语言可视化包ggplot2在一张图中画出两条线实战目录 R语言可视化包ggplot2在一张图中画出两条线实战
[置顶]R语言 ggplot2包
R语言 ggplot2包的学习分析数据要做的第一件事情,就是观察它.对于每个变量,哪些值是最常见的?值域是大是小?是否有异常观测? ggplot2图形之基本语法: ggplot2的核心理念是将绘图 ...
R语言geodetector包基于栅格图像实现地理探测器操作
本文介绍基于R语言中的geodetector包,依据多张栅格图像数据,实现地理探测器(Geodetector)操作的详细方法. 需要说明的是,在R语言中进行地理探测器操作,可以分别通过geod ...
R 回归虚拟变量na_工具amp;方法 | R语言机器学习包大全（共45个包）
机器学习,是一门多学科交叉的人工智能领域的分析技术,它使用算法解析数据,从中学习,然后对世界上的某件事情做出决定或预测. 目前,常见机器学习的研究方向主要包括决策树.随机森林.神经网络.贝叶斯学习和支 ...
数据分析必备：掌握这个R语言基础包1%的功能，你就很牛了
导读:无论数据分析的目的是什么,将数据导入R中的过程都是不可或缺的.毕竟巧妇难为无米之炊. utils包是R语言的基础包之一.这个包最重要的任务其实并不是进行数据导入,而是为编程和开发R包提供非常实用 ...

用R语言scorecard包做一张标准评分卡

用R语言scorecard包做一张标准评分卡相关推荐

最新文章

热门文章