Credit Scoring Model Optimization
Original source:
http://blog.csdn.net/csqazwsxedc/article/details/
(I have already reposted it on my blog.)
Problems with the original:
1. Getting the source data. It has to be downloaded from a site abroad (/c/GiveMeSomeCredit/data), which requires registering an account; registration uses a Google captcha, and since Google is blocked in mainland China the captcha only loads over a VPN. A simple workaround is the 360 browser's built-in "穿越苍穹" proxy, which offers a 5-minute trial, or any other VPN tool. I have also uploaded the data to my resource space (http://download.csdn.net/detail/abc/9904440); feel free to download it from there.
2. The code at the start of the original post was incomplete; it is completed here.
3. In the original's WOE conversion, the binning is entirely manual equal-width binning, which is quite unreasonable. Automatic binning should be used instead (chi-square binning, or the optimal binning from the smbinning package); in any case avoid hand-made equal-width bins, and use equal-frequency binning at the very least (see the sketch after this list).
4. When building the logistic regression, the independent variables are best replaced by the WOE values of their bins; the model performs better than with the raw values, which is why logistic scorecard models generally include a WOE binning step. That said, modeling directly on the raw variables also works.
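As a minimal illustration of point 3 (a sketch, not part of the original code; eqfreq_bins is a name made up here), equal-frequency binning cuts at quantiles, so every bin carries roughly the same number of observations, whereas equal-width binning slices the value range into equal spans regardless of where the data mass sits:

eqfreq_bins <- function(x, nbins = 5) {
  # cut points at quantiles; unique() guards against ties producing duplicate breaks
  brks <- unique(quantile(x, probs = seq(0, 1, length.out = nbins + 1), na.rm = TRUE))
  cut(x, brks, include.lowest = TRUE)
}
# e.g., on a skewed variable such as monthly income:
# table(eqfreq_bins(income, 5))  # roughly equal counts per bin
# table(cut(income, 5))          # badly skewed counts under equal-width cuts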
The code follows:
setwd('F:/study/code/R_ompany/credit')
#library(devtools)
#install_github("riv","tomasgreif")
traindata0 <- read.csv("cs-training.csv", stringsAsFactors = F)
traindata <- traindata0[,2:12]
#traindata <- traindata0[,3:12]
y <- traindata0[,2]
names(traindata) <- c('y','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10')
# missing-value analysis
summary(traindata)
library(VIM)
matrixplot(traindata)
library(mice)
md.pattern(traindata)
# impute the missing values with kNN
library(DMwR)
traindata <- knnImputation(traindata, k = 10, meth = "weighAvg")  # kNN imputation is quite slow
traindata1 <- traindata
### outlier detection and treatment
traindata <- traindata1
names(traindata) <- c('y','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10')
boxplot(traindata)  # boxplot; x5 dominates the scale
unique(traindata$x2)  # 0 is an outlier
traindata <- traindata[-which(traindata$x2==0),]  # drop the outliers
boxplot(traindata[,c(4,8,10)])  # boxplots
unique(traindata$x3)  # 96 and 98 are outliers
traindata <- traindata[-which(traindata$x3>=96),]  # drop the outliers
#which(traindata$x3 %in% c(96,98))
boxplot(traindata$x6)  # boxplot
unique(traindata$x6)  # 96 and 98 are outliers
boxplot(traindata$x7)  # boxplot
unique(traindata$x7)  # 96 and 98 are outliers
boxplot(traindata$x8)  # boxplot
unique(traindata$x8)  # 96 and 98 are outliers
boxplot(traindata$x9)  # boxplot
unique(traindata$x9)  # 96 and 98 are outliers
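The four identical check blocks above could be compacted into a single loop; a small sketch (the same calls, just iterated):

for (v in c("x6", "x7", "x8", "x9")) {
  boxplot(traindata[[v]], main = v)             # one boxplot per variable
  print(tail(sort(unique(traindata[[v]])), 3))  # the largest distinct values; 96 and 98 stand out
}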
########### quantiles of each indicator and counts beyond the upper limit
samp_num <- traindata[,c(2,5,6,11)]  # pull out the continuous variables
nn <- nrow(samp_num)
mystats = function(x){
  num_unique = length(unique(x))
  mmean = mean(x)
  qq_bin = as.numeric(quantile(x, prob=c(0,0.01,0.05,0.1,0.25,0.5,
                                         0.75,0.9,0.95,0.99,1), na.rm=T))
  b_up = mean(x) + 3*sd(x)        # upper limit: mean + 3 standard deviations
  n_max = sum(x > b_up, na.rm=T)  # how many values exceed the limit
  n_max_p = n_max/nn
  return(c(num_unique, mmean, b_up, n_max, n_max_p, qq_bin))
}
dim(samp_num)
tt <- apply(samp_num, 2, mystats)
ttt <- t(as.data.frame(tt))
# cap extreme values
extre_value = function(x){
  x_limit = mean(x) + 3*sd(x)  # upper limit
  x[x < 0] = 0                 # replace negatives with 0
  x[x > x_limit] = x_limit     # cap values above the upper limit
  return(x)
}
d_num <- apply(traindata[,c(2,5,6)],2,extre_value)  ## vectorized extreme-value treatment, column by column
traindata[,c(2,5,6)] <- d_num
### the other variables are left untreated for now.
################## variable analysis
####### univariate inspection
library("ggplot2")
ggplot(traindata, aes(x = x2, y = ..density..)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2) + geom_density()
# age is roughly normally distributed, consistent with the assumptions of the analysis.
ggplot(traindata, aes(x = x5, y = ..density..)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2) + geom_density() + xlim(1, 20000)
# monthly income is also roughly normal, which suits the analysis.
###################### binning (optional; you can skip this and use the smbinning approach later instead)
# automatic chi-square binning (ChiMerge)
chimerge = function(data, begin=1000, end=4){
  # data has two columns: column 1 the numeric variable, column 2 the 0/1 response
  # begin is the number of initial segments, end the target number of bins
  breaks = seq(min(data[,1]), max(data[,1]), (max(data[,1])-min(data[,1]))/begin)
  # set up the initial intervals
  data[,3] = cut(data[,1], breaks)
  # turn the numeric variable into a categorical one
  tj1 = table(data[data[,2]==1,3])
  tj0 = table(data[data[,2]==0,3])
  while(length(breaks) > (end+2)){
    kafang = c()
    for (i in 1:(length(breaks)-2)){
      # chi-square of each interval against the next one
      a = tj1[i]
      b = tj1[i+1]
      c = tj0[i]
      d = tj0[i+1]
      if (a+b==0 || d+c==0 || a+c==0 || b+d==0)
        kafang[i] = 0
      else
        kafang[i] = ((a*d-b*c)^2*(a+b+c+d)/(a+b)/(d+c)/(a+c)/(b+d))
    }
    index = which(kafang==min(kafang))[1]
    # index of the pair with the smallest chi-square
    breaks = breaks[-(index+1)]
    # merge the two intervals
    tj1[index] = tj1[index]+tj1[index+1]
    tj0[index] = tj0[index]+tj0[index+1]
    tj1 = tj1[-(index+1)]
    tj0 = tj0[-(index+1)]
  }
  return(breaks)
}
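As a sanity check (an addition, not in the original post): the expression inside chimerge is the standard 2x2 chi-square statistic without continuity correction, so it should agree with chisq.test on any pair of adjacent bins:

m <- matrix(c(10, 20, 30, 40), nrow = 2, byrow = TRUE)  # row 1: class-1 counts in bins i, i+1; row 2: class-0 counts
n11 <- m[1,1]; n12 <- m[1,2]; n21 <- m[2,1]; n22 <- m[2,2]
(n11*n22 - n12*n21)^2 * sum(m) / ((n11+n12)*(n21+n22)*(n11+n21)*(n12+n22))  # 0.7936508
chisq.test(m, correct = FALSE)$statistic                                    # the same value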
mydata <- traindata
brks <- chimerge(traindata[,c(2,1)])
mydata$x1 <- cut(traindata$x1, brks, include.lowest = T)
ss <- table(mydata[,c(1,2)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio across bins is best
pct0_x1 <- ss[1,]/sum(ss[1,])
pct1_x1 <- ss[2,]/sum(ss[2,])
woe_x1 <- log(pct1_x1/pct0_x1)
pct_x1 <- pct1_x1-pct0_x1
iv_x1 <- sum(woe_x1*pct_x1)
iv_x1  # 0.77
mydata$x1_woe <- as.character(mydata$x1)
mydata$x1_woe[which(mydata$x1==names(woe_x1[1]))] <- woe_x1[[1]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[2]))] <- woe_x1[[2]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[3]))] <- woe_x1[[3]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[4]))] <- woe_x1[[4]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[5]))] <- woe_x1[[5]]
head(mydata$x1_woe)
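The cut/WOE/map steps repeat verbatim for every variable below; as a sketch (woe_encode is a helper name invented here, not from the original), they could be wrapped once and reused:

woe_encode <- function(x, y, brks){
  bin  <- cut(x, brks, include.lowest = TRUE)
  ss   <- table(y, bin)            # row 1: y==0 (good), row 2: y==1 (bad)
  pct0 <- ss[1,]/sum(ss[1,])
  pct1 <- ss[2,]/sum(ss[2,])
  woe  <- log(pct1/pct0)
  list(bin = bin, woe = as.numeric(woe[bin]), iv = sum(woe*(pct1 - pct0)))
}
# e.g. enc <- woe_encode(traindata$x2, traindata$y, chimerge(traindata[,c(3,1)], 100, 5))
# mydata$x2_woe <- enc$woe; enc$iv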
brks <- chimerge(traindata[,c(3,1)],100,5)
mydata$x2 <- cut(traindata$x2, brks, include.lowest = T)
ss <- table(mydata[,c(1,3)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x2 <- ss[1,]/sum(ss[1,])
pct1_x2 <- ss[2,]/sum(ss[2,])
woe_x2 <- log(pct1_x2/pct0_x2)
pct_x2 <- pct1_x2-pct0_x2
iv_x2 <- sum(woe_x2*pct_x2)
iv_x2  # 0.255
mydata$x2_woe <- as.character(mydata$x2)
mydata$x2_woe[which(mydata$x2==names(woe_x2[1]))] <- woe_x2[[1]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[2]))] <- woe_x2[[2]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[3]))] <- woe_x2[[3]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[4]))] <- woe_x2[[4]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[5]))] <- woe_x2[[5]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[6]))] <- woe_x2[[6]]
head(mydata$x2_woe)
unique(traindata$x3)
brks <- chimerge(traindata[,c(4,1)],100,5)
mydata$x3 <- cut(traindata$x3, brks, include.lowest = T)
ss <- table(mydata[,c(1,4)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x3 <- ss[1,]/sum(ss[1,])
pct1_x3 <- ss[2,]/sum(ss[2,])
woe_x3 <- log(pct1_x3/pct0_x3)
pct_x3 <- pct1_x3-pct0_x3
iv_x3 <- sum(woe_x3*pct_x3)
mydata$x3_woe <- as.character(mydata$x3)
mydata$x3_woe[which(mydata$x3==names(woe_x3[1]))] <- woe_x3[[1]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[2]))] <- woe_x3[[2]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[3]))] <- woe_x3[[3]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[4]))] <- woe_x3[[4]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[5]))] <- woe_x3[[5]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[6]))] <- woe_x3[[6]]
head(mydata$x3_woe)
unique(traindata$x4)
brks <- chimerge(traindata[,c(5,1)],1000,5)
mydata$x4 <- cut(traindata$x4, brks, include.lowest = T)
ss <- table(mydata[,c(1,5)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x4 <- ss[1,]/sum(ss[1,])
pct1_x4 <- ss[2,]/sum(ss[2,])
woe_x4 <- log(pct1_x4/pct0_x4)
pct_x4 <- pct1_x4-pct0_x4
iv_x4 <- sum(woe_x4*pct_x4)
mydata$x4_woe <- as.character(mydata$x4)
mydata$x4_woe[which(mydata$x4==names(woe_x4[1]))] <- woe_x4[[1]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[2]))] <- woe_x4[[2]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[3]))] <- woe_x4[[3]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[4]))] <- woe_x4[[4]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[5]))] <- woe_x4[[5]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[6]))] <- woe_x4[[6]]
head(mydata$x4_woe)
unique(traindata$x5)
brks <- chimerge(traindata[,c(6,1)],1000,6)
mydata$x5 <- cut(traindata$x5, brks, include.lowest = T)
ss <- table(mydata[,c(1,6)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x5 <- ss[1,]/sum(ss[1,])
pct1_x5 <- ss[2,]/sum(ss[2,])
woe_x5 <- log(pct1_x5/pct0_x5)
pct_x5 <- pct1_x5-pct0_x5
iv_x5 <- sum(woe_x5*pct_x5)
mydata$x5_woe <- as.character(mydata$x5)
mydata$x5_woe[which(mydata$x5==names(woe_x5[1]))] <- woe_x5[[1]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[2]))] <- woe_x5[[2]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[3]))] <- woe_x5[[3]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[4]))] <- woe_x5[[4]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[5]))] <- woe_x5[[5]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[6]))] <- woe_x5[[6]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[7]))] <- woe_x5[[7]]
head(mydata$x5_woe)
unique(traindata$x6)
brks <- chimerge(traindata[,c(7,1)],100,6)
mydata$x6 <- cut(traindata$x6, brks, include.lowest = T)
ss <- table(mydata[,c(1,7)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x6 <- ss[1,]/sum(ss[1,])
pct1_x6 <- ss[2,]/sum(ss[2,])
woe_x6 <- log(pct1_x6/pct0_x6)
## the shape looks good
pct_x6 <- pct1_x6-pct0_x6
iv_x6 <- sum(woe_x6*pct_x6)
mydata$x6_woe <- as.character(mydata$x6)
mydata$x6_woe[which(mydata$x6==names(woe_x6[1]))] <- woe_x6[[1]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[2]))] <- woe_x6[[2]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[3]))] <- woe_x6[[3]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[4]))] <- woe_x6[[4]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[5]))] <- woe_x6[[5]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[6]))] <- woe_x6[[6]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[7]))] <- woe_x6[[7]]
head(mydata$x6_woe)
unique(traindata$x7)
brks <- chimerge(traindata[,c(8,1)],100,5)
mydata$x7 <- cut(traindata$x7, brks, include.lowest = T)
ss <- table(mydata[,c(1,8)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x7 <- ss[1,]/sum(ss[1,])
pct1_x7 <- ss[2,]/sum(ss[2,])
woe_x7 <- log(pct1_x7/pct0_x7)
## the shape looks good
pct_x7 <- pct1_x7-pct0_x7
iv_x7 <- sum(woe_x7*pct_x7)
mydata$x7_woe <- as.character(mydata$x7)
mydata$x7_woe[which(mydata$x7==names(woe_x7[1]))] <- woe_x7[[1]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[2]))] <- woe_x7[[2]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[3]))] <- woe_x7[[3]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[4]))] <- woe_x7[[4]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[5]))] <- woe_x7[[5]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[6]))] <- woe_x7[[6]]
head(mydata$x7_woe)
unique(traindata$x8)
brks <- chimerge(traindata[,c(9,1)],100,5)
mydata$x8 <- cut(traindata$x8, brks, include.lowest = T)
ss <- table(mydata[,c(1,9)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x8 <- ss[1,]/sum(ss[1,])
pct1_x8 <- ss[2,]/sum(ss[2,])
woe_x8 <- log(pct1_x8/pct0_x8)
## the shape looks good
pct_x8 <- pct1_x8-pct0_x8
iv_x8 <- sum(woe_x8*pct_x8)
mydata$x8_woe <- as.character(mydata$x8)
mydata$x8_woe[which(mydata$x8==names(woe_x8[1]))] <- woe_x8[[1]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[2]))] <- woe_x8[[2]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[3]))] <- woe_x8[[3]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[4]))] <- woe_x8[[4]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[5]))] <- woe_x8[[5]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[6]))] <- woe_x8[[6]]
head(mydata$x8_woe)
unique(traindata$x9)
brks <- chimerge(traindata[,c(10,1)],100,2)
mydata$x9 <- cut(traindata$x9, brks, include.lowest = T)
ss <- table(mydata[,c(1,10)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x9 <- ss[1,]/sum(ss[1,])
pct1_x9 <- ss[2,]/sum(ss[2,])
woe_x9 <- log(pct1_x9/pct0_x9)
## the shape looks good
pct_x9 <- pct1_x9-pct0_x9
iv_x9 <- sum(woe_x9*pct_x9)
mydata$x9_woe <- as.character(mydata$x9)
mydata$x9_woe[which(mydata$x9==names(woe_x9[1]))] <- woe_x9[[1]]
mydata$x9_woe[which(mydata$x9==names(woe_x9[2]))] <- woe_x9[[2]]
mydata$x9_woe[which(mydata$x9==names(woe_x9[3]))] <- woe_x9[[3]]
head(mydata$x9_woe)
unique(traindata$x10)
brks <- chimerge(traindata[,c(11,1)],1000,6)
mydata$x10 <- cut(traindata$x10, brks, include.lowest = T)
ss <- table(mydata[,c(1,11)])
ss[1,]/ss[2,]  ### a monotone (increasing or decreasing) ratio is best
pct0_x10 <- ss[1,]/sum(ss[1,])
pct1_x10 <- ss[2,]/sum(ss[2,])
woe_x10 <- log(pct1_x10/pct0_x10)
## the shape looks good
pct_x10 <- pct1_x10-pct0_x10
iv_x10 <- sum(woe_x10*pct_x10)
mydata$x10_woe <- as.character(mydata$x10)
mydata$x10_woe[which(mydata$x10==names(woe_x10[1]))] <- woe_x10[[1]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[2]))] <- woe_x10[[2]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[3]))] <- woe_x10[[3]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[4]))] <- woe_x10[[4]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[5]))] <- woe_x10[[5]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[6]))] <- woe_x10[[6]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[7]))] <- woe_x10[[7]]
head(mydata$x10_woe)
c(iv_x1,iv_x2,iv_x3,iv_x4,iv_x5,iv_x6,iv_x7,iv_x8,iv_x9,iv_x10)
#hist(traindata$x1, brks)
#plot(cut(traindata$x1, brks))
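A common rule of thumb for reading these IVs (an addition, not from the original post): below 0.02 useless, 0.02-0.1 weak, 0.1-0.3 medium, above 0.3 strong. A quick screen along those lines:

iv_all <- c(x1=iv_x1, x2=iv_x2, x3=iv_x3, x4=iv_x4, x5=iv_x5,
            x6=iv_x6, x7=iv_x7, x8=iv_x8, x9=iv_x9, x10=iv_x10)
names(iv_all)[iv_all >= 0.1]  # the variables at least "medium" by this rule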
####### checking correlations between variables
# before modeling we first test the correlations between the variables; strong correlations would hurt the model's predictive power
cor1 <- cor(traindata[,1:11])
library("corrplot")
corrplot(cor1)
corrplot(cor1, method = "number")
# the plot shows that the correlations between the variables are very small.
####### splitting the data
table(traindata$y)
# The table shows a clear class imbalance in the response SeriousDlqin2yrs: 9879 observations have SeriousDlqin2yrs = 1, only about 6.6% of the total.
# The imbalance therefore needs handling; here the SMOTE algorithm can be used in R to oversample the rare event.
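That SMOTE step is only mentioned, not implemented, in the original; a sketch of it using DMwR (already loaded above for knnImputation) follows. The perc.over/perc.under values are illustrative, not tuned:

# SMOTE needs a factor response; perc.over = 200 synthesizes 2 new minority cases per original one,
# and perc.under = 150 keeps 1.5 majority cases per synthetic minority case
smote_train <- SMOTE(y ~ ., data = transform(traindata, y = as.factor(y)),
                     perc.over = 200, perc.under = 150)
prop.table(table(smote_train$y))  # the class ratio is now far more balanced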
# We use the createDataPartition (data-splitting) function from the caret package to split the data randomly into two equal parts
library(caret)
set.seed(1234)
splitIndex <- createDataPartition(traindata$y, times=1, p=0.5, list=FALSE)
train <- traindata[splitIndex,]
test <- traindata[-splitIndex,]
prop.table(table(train$y))
prop.table(table(test$y))
# both halves preserve the class ratio at about 6.6%, the same level as before,
# so this split can be used for modeling and prediction.
########### 5. Logistic regression
# Logistic regression plays the central role in scorecard development. Thanks to its form, and to the weight-of-evidence (WOE) transformation of the inputs,
# its output can be converted directly into a summary table, the so-called standard scorecard format.
fit <- glm(y~., train, family = "binomial")
summary(fit)
# With all variables in the regression the fit is not great: the p-values of x1 and x6 fail the significance test,
# so we drop the insignificant variables and regress y on the rest.
fit2 <- glm(y~x2+x3+x7+x9, train, family = "binomial")
summary(fit2)
# every variable in the second model passes the test, and the AIC (Akaike information criterion) is even smaller, so the fit is better
fit3 <- glm(y~x2+x3+x4+x5+x7+x8+x9+x10, train, family = "binomial")
summary(fit3)
### model evaluation
# predict on the test set
pre <- predict(fit3, test)
# In R the pROC package makes this convenient: it can compare two classifiers, automatically marks the optimal cutoff, and the plot looks quite nice.
library(pROC)
modelroc <- roc(test$y, pre)
plot(modelroc, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col="skyblue", print.thres=TRUE)
# at the optimal cutoff the plot reports a specificity (TNR) of 0.845 and a sensitivity (TPR) of 0.638, with an AUC of 0.8102, so the model's predictions are reasonably accurate.
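Credit models are also commonly judged by the KS statistic, max(TPR - FPR); it is not computed in the original post, but it falls straight out of the pROC object (a small addition):

ks <- max(modelroc$sensitivities + modelroc$specificities - 1)  # TPR - FPR = sens + spec - 1
ks  # values above roughly 0.3 are usually considered acceptable for scorecards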
###### the WOE transformation
# The weight-of-evidence (WOE) transformation turns a logistic regression model into the standard scorecard format.
# The point of the WOE transformation is not to improve model quality; rather, some variables should not enter the model,
# either because they add no value or because the error attached to their coefficients is too large. A standard scorecard can in fact be built without the WOE transformation,
# in which case the logistic regression has to handle a larger number of inputs. That complicates the modeling, but the resulting scorecard is the same.
# Each variable x is replaced by WOE(x), where for bin i: WOE_i = ln[(bad_i / total bad) / (good_i / total good)].
# Following the logistic regression above, we drop x1, x4, x5 and x6 and apply the WOE transformation to the remaining variables.
library(smbinning)
####### bin each variable and compute its WOE
#### look at all the IV values
sumivt <- smbinning.sumiv(df=train, y="y")  # IV for each variable
sumivt  # display the IV table by characteristic
par(mfrow=c(1,1))
smbinning.sumiv.plot(sumivt, cex=1)  # plot the IV summary table
### x1, x8 and x9 are missing from the table; they need special handling
result_x1 = smbinning(df=train, y='y', x="x1", p=0.05)
result_x1$ivtable
# custom binning: equal-frequency cuts
per <- as.vector(quantile(train$x1, probs=seq(0,1,0.2), na.rm=T))
breaks <- per[2:(length(per)-1)]
result_x1 = smbinning.custom(df=train, y='y', x="x1", cuts=breaks)
result_x1$ivtable
smbinning.plot(result_x1, option="WoE", sub="x1")  # smbinning found no split for x1, hence the custom cuts
result_x1$iv
unique(train$x2)
result_x2 = smbinning(df=train, y='y', x="x2", p=0.01)
smbinning.plot(result_x2, option="WoE", sub="x2")  # check the WoE trend
result_x2$iv  # check the IV
unique(train$x3)
result_x3 = smbinning(df=train, y='y', x="x3", p=0.01)
smbinning.plot(result_x3, option="WoE", sub="x3")  # check the WoE trend
result_x3$iv  # check the IV
unique(train$x4)
result_x4 = smbinning(df=train, y='y', x="x4", p=0.001)
smbinning.plot(result_x4, option="WoE", sub="x4")  # check the WoE trend
result_x4$iv  # check the IV
unique(train$x5)
result_x5 = smbinning(df=train, y='y', x="x5", p=0.001)
smbinning.plot(result_x5, option="WoE", sub="x5")  # check the WoE trend
result_x5$iv  # check the IV
unique(train$x6)
result_x6 = smbinning(df=train, y='y', x="x6", p=0.01)
smbinning.plot(result_x6, option="WoE", sub="x6")  # check the WoE trend
result_x6$iv  # check the IV
unique(train$x7)
result_x7 = smbinning(df=train, y='y', x="x7", p=0.001)
smbinning.plot(result_x7, option="WoE", sub="x7")  # check the WoE trend
result_x7$iv  # check the IV
result_x7$ivtable
unique(train$x8)
result_x8 = smbinning(df=train, y='y', x="x8", p=0.01)
smbinning.plot(result_x8, option="WoE", sub="x8")  # smbinning found no split for x8
# custom binning: equal-frequency cuts
per <- as.vector(quantile(train$x8, probs=seq(0,1,0.25), na.rm=T))
breaks <- per[2:(length(per)-1)]
result_x8 = smbinning.custom(df=train, y='y', x="x8", cuts=breaks)
result_x8$ivtable
smbinning.plot(result_x8, option="WoE", sub="x8")  # WoE of the custom bins
result_x8$iv  # check the IV
unique(train$x9)
result_x9 = smbinning(df=train, y='y', x="x9", p=0.01)
smbinning.plot(result_x9, option="WoE", sub="x9")  # check the WoE trend
result_x9$iv  # check the IV
unique(train$x10)
result_x10 = smbinning(df=train, y='y', x="x10", p=0.0001)
smbinning.plot(result_x10, option="WoE", sub="x10")  # check the WoE trend
result_x10$iv  # check the IV
result_x10$ivtable
##### apply the WOE transformation to the variables
### a tweak of the smbinning.gen source: write each band's WoE value instead of a band label
smbinning.wen <- function (df, ivout, chrname = "NewChar"){
  df = cbind(df, tmpname = NA)
  ncol = ncol(df)
  col_id = ivout$col_id
  b = ivout$bands
  c = ivout$ivtable[,13]  # column 13 of ivtable holds the WoE of each band
  df[, ncol][is.na(df[, col_id])] = 0
  df[, ncol][df[, col_id] <= b[2]] = c[1]
  if (length(b) > 3) {
    for (i in 2:(length(b) - 2)) {
      df[, ncol][df[, col_id] > b[i] & df[, col_id] <= b[i + 1]] = c[i]
    }
  }
  df[, ncol][df[, col_id] > b[length(b) - 1]] = c[length(b) - 1]
  #df[, ncol] = as.factor(df[, ncol])  ## convert to a factor
  names(df)[names(df) == "tmpname"] = chrname
  return(df)
}
train = train[,1:11]
train = smbinning.wen(train, result_x1, chrname = "wx1")  # appends one column
head(train$wx1)
table(train$wx1)
## same for the rest
train = smbinning.wen(train, result_x2, chrname = "wx2")
train = smbinning.wen(train, result_x3, chrname = "wx3")
train = smbinning.wen(train, result_x4, chrname = "wx4")
train = smbinning.wen(train, result_x5, chrname = "wx5")
train = smbinning.wen(train, result_x6, chrname = "wx6")
train = smbinning.wen(train, result_x7, chrname = "wx7")
train = smbinning.wen(train, result_x8, chrname = "wx8")
train = smbinning.wen(train, result_x9, chrname = "wx9")
train = smbinning.wen(train, result_x10, chrname = "wx10")
###### build the WOE data frame:
trainWOE = train[,12:21]
##################### 7. Creating and implementing the scorecard
# In the data "1" marks a default, so direct modeling predicts "the probability of default" and log(odds) is the bad:good ratio.
# To match the usual convention that a higher score means better credit, we swap "0" and "1" so the model predicts "the probability of not defaulting", and log(odds) becomes the good:bad ratio.
trainWOE$y = 1 - train$y
glm.fit = glm(y~., data = trainWOE, family = binomial(link = logit))
summary(glm.fit)
coe = (glm.fit$coefficients)
### with the WOE values the correlations are even lower
cor1 <- cor(trainWOE[,1:10])
library("corrplot")
corrplot(cor1)
corrplot(cor1, method = "number")
#### using all the variables works better, so the scorecard below uses all of them
fit4 <- glm(y~., trainWOE, family = "binomial")
summary(fit4)
fit5 <- glm(y~wx1+wx2+wx3+wx4+wx5+wx7+wx8+wx9, trainWOE, family = "binomial")
summary(fit5)
p <- 20/log(2)  # factor: 20 points per doubling of the odds (PDO = 20)
q <- 600 - 20*log(15)/log(2)  # offset: anchors the score at 600 points for odds of 15:1
Score = q + p*as.numeric(coe[1]) + p*as.numeric(coe[2])*trainWOE$wx1 + p*as.numeric(coe[3])*trainWOE$wx2 +
  p*as.numeric(coe[4])*trainWOE$wx3 + p*as.numeric(coe[5])*trainWOE$wx4 + p*as.numeric(coe[6])*trainWOE$wx5 +
  p*as.numeric(coe[7])*trainWOE$wx6 + p*as.numeric(coe[8])*trainWOE$wx7 + p*as.numeric(coe[9])*trainWOE$wx8 +
  p*as.numeric(coe[10])*trainWOE$wx9 + p*as.numeric(coe[11])*trainWOE$wx10
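A quick check of what p and q do (a sketch, not in the original): the scaling is the usual score = offset + factor * ln(odds), anchored at 600 points for 15:1 odds with 20 points needed to double the odds:

score_at <- function(odds) q + p*log(odds)
score_at(15)  # 600 by construction
score_at(30)  # 620: doubling the odds adds exactly PDO = 20 points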
# total personal score = base score + the score of each component
# the base score is:
base <- q + p*as.numeric(coe[1])
# 1. score each variable
## build the scoring function:
getscore <- function(i, x){
  score = round(p*as.numeric(coe[i])*x, 0)
  return(score)
}
# 2. compute the score for each bin of each variable (the coefficient for xi sits at coe[i+1]):
x1 <- as.data.frame(getscore(2, result_x1$ivtable[1:(length(result_x1$bands)-1),13]))
rownames(x1) <- result_x1$ivtable[1:(length(result_x1$bands)-1),1]
colnames(x1) <- 'x1'
x2 <- as.data.frame(getscore(3, result_x2$ivtable[1:(length(result_x2$bands)-1),13]))
rownames(x2) <- result_x2$ivtable[1:(length(result_x2$bands)-1),1]
colnames(x2) <- 'x2'
x3 <- as.data.frame(getscore(4, result_x3$ivtable[1:(length(result_x3$bands)-1),13]))
rownames(x3) <- result_x3$ivtable[1:(length(result_x3$bands)-1),1]
colnames(x3) <- 'x3'
x4 <- as.data.frame(getscore(5, result_x4$ivtable[1:(length(result_x4$bands)-1),13]))
rownames(x4) <- result_x4$ivtable[1:(length(result_x4$bands)-1),1]
colnames(x4) <- 'x4'
x5 <- as.data.frame(getscore(6, result_x5$ivtable[1:(length(result_x5$bands)-1),13]))
rownames(x5) <- result_x5$ivtable[1:(length(result_x5$bands)-1),1]
colnames(x5) <- 'x5'
x6 <- as.data.frame(getscore(7, result_x6$ivtable[1:(length(result_x6$bands)-1),13]))
rownames(x6) <- result_x6$ivtable[1:(length(result_x6$bands)-1),1]
colnames(x6) <- 'x6'
x7 <- as.data.frame(getscore(8, result_x7$ivtable[1:(length(result_x7$bands)-1),13]))
rownames(x7) <- result_x7$ivtable[1:(length(result_x7$bands)-1),1]
colnames(x7) <- 'x7'
x8 <- as.data.frame(getscore(9, result_x8$ivtable[1:(length(result_x8$bands)-1),13]))
rownames(x8) <- result_x8$ivtable[1:(length(result_x8$bands)-1),1]
colnames(x8) <- 'x8'
x9 <- as.data.frame(getscore(10, result_x9$ivtable[1:(length(result_x9$bands)-1),13]))
rownames(x9) <- result_x9$ivtable[1:(length(result_x9$bands)-1),1]
colnames(x9) <- 'x9'
x10 <- as.data.frame(getscore(11, result_x10$ivtable[1:(length(result_x10$bands)-1),13]))
rownames(x10) <- result_x10$ivtable[1:(length(result_x10$bands)-1),1]
colnames(x10) <- 'x10'
score <- list(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10)  ### the integer-point scorecard
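The ten nearly identical blocks above differ only in their indices, so (a sketch, not in the original) the same scorecard list can be built in a loop; the results list is also reused further below for transforming the test set:

results <- list(result_x1, result_x2, result_x3, result_x4, result_x5,
                result_x6, result_x7, result_x8, result_x9, result_x10)
score <- lapply(seq_along(results), function(i){
  r  <- results[[i]]
  n  <- length(r$bands) - 1                              # number of bands
  df <- data.frame(getscore(i + 1, r$ivtable[1:n, 13]))  # coefficient for xi sits at coe[i+1]
  rownames(df) <- r$ivtable[1:n, 1]
  colnames(df) <- paste0("x", i)
  df
})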
##### the non-integer scores
train$score <- q + p*as.numeric(coe[1]) + p*as.numeric(coe[2])*trainWOE$wx1 + p*as.numeric(coe[3])*trainWOE$wx2 +
  p*as.numeric(coe[4])*trainWOE$wx3 + p*as.numeric(coe[5])*trainWOE$wx4 + p*as.numeric(coe[6])*trainWOE$wx5 +
  p*as.numeric(coe[7])*trainWOE$wx6 + p*as.numeric(coe[8])*trainWOE$wx7 + p*as.numeric(coe[9])*trainWOE$wx8 +
  p*as.numeric(coe[10])*trainWOE$wx9 + p*as.numeric(coe[11])*trainWOE$wx10
sort(train$score)
############################# one can also build the model on the WOE-substituted data and evaluate it on the test set
# the test set must first receive the same WOE transformation as the training set (reusing the results list built above)
test <- test[,1:11]
for (i in 1:10) test <- smbinning.wen(test, results[[i]], chrname = paste0("wx", i))
testWOE <- test[,12:21]
testWOE$y <- 1 - test$y
pre <- predict(glm.fit, testWOE)
# In R the pROC package makes it easy to compare two classifiers, automatically marks the optimal cutoff, and the plot looks quite nice.
library(pROC)
modelroc <- roc(testWOE$y, pre)
plot(modelroc, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col="skyblue", print.thres=TRUE)
Compare this with the code from the original post! [Mind map of the scorecard workflow — image]