###################################################################### ### do diabetes with forwards stepwise ###################################################################### #dpl=FALSE dpl=TRUE library(MASS) source("do-stepcv.R") ###################################################################### ############################################################ if(1) {cat("### read in data\n") ddf = read.csv("diabetes.csv") n=nrow(ddf) } ############################################################ if(1) {cat("### try extractAIC\n") #linear in 10x's fmc = paste("y~",paste0(names(ddf)[2:11],collapse="+")) lmnotran = lm(as.formula(fmc),ddf) bicnotran = extractAIC(lmnotran,k=log(n)) #k=log(n) is bic print(summary(lmnotran)) #all 64 x's lmtran =lm(y~.,ddf) bictran = extractAIC(lmtran,k=log(n)) print(summary(lmtran)) #print out bic's cat("bic: without trans:",bicnotran,", with trans: ",bictran,"\n") } ############################################################ if(1) {cat("### try stepAIC\n") ##do forwards stepAIC, stopping when adding any var makes BIC bigger. nullmod = lm(y~1,ddf) #start with this model fullmod = lmtran #add variables from this model #trace=0 suppresses output as it runs #start at nullmod, add variables from lmtran #k=2 means AIC e.g. penalty is k*numpars fwd = stepAIC(nullmod,scope=formula(lmtran),direction="forward",k=log(n),trace=0) print(summary(fwd)) bicfwd = extractAIC(fwd,k=log(n)) cat("bic for forward model is: ",bicfwd,"\n") print(summary(fwd)) } ############################################################ if(1) {cat("### do stepAIC and keep summaries of model as you go\n") #keep vars and mse ypred = ddf$y keepf = function(mod,maic) { n = length(ypred) rval = list() k = length(mod$coef) rval$varname = names(mod$coef)[k] yhat = mod$fitted rval$mse = mean((yhat-ypred)^2) rval$Rsq = summary(mod)$r.squared rval$bic = extractAIC(mod,k=log(n)) return(rval) } nstep=25 fwd = stepAIC(nullmod,scope=formula(lmtran),direction="forward", k=0,trace=0,keep=keepf,steps=nstep) nmii = 1+4*(1:(nstep)) vnms = unlist(fwd$keep[nmii]); msev = unlist(fwd$keep[nmii+1]) Rsqv= unlist(fwd$keep[nmii+2]); bicv = unlist(fwd$keep[nmii+3]) print(vnms) print(sqrt(msev)) print(Rsqv) print(bicv) if(dpl) pdf("r2-mse.pdf",height=5,width=12) par(mfrow=c(1,2)) plot(Rsqv,xlab="num vars",ylab="R-squared",cex.lab=1.5,cex.axis=1.5,type="b") plot(sqrt(msev),xlab="num vars",ylab="RMSE",cex.lab=1.5,cex.axis=1.5,type="b") dev.off() print(vnms) if(dpl) pdf("bic.pdf",height=8,width=12) iibic=seq(from=1,to=(2*nstep) -1,by=2) plot(bicv[iibic],bicv[iibic+1],xlab="num param",ylab="BIC",cex.lab=1.5,cex.axis=1.5,type="b") dev.off() } ############################################################ if(1) {cat("### do k-fold with forwards step\n") source("do-stepcv.R") yind=1 xind=2:ncol(ddf) set.seed(99) folds = getfolds(10,nrow(ddf)) nstep=25 stcv = stepcv(ddf,yind,xind,formula(lmtran),folds,nstep) rmse=sqrt(apply(stcv,1,sum)/n) plot(0:nstep,rmse,xlab="num var",ylab="RMSE", cex.lab=1.5,cex.axis=1.5,type="b",col="red",pch=16) ##do several runs nstep=20 ntry=50 set.seed(99) rmseM = matrix(0.0,nstep+1,ntry) for(i in 1:ntry) { cat("#### doing try: ",i,"\n") folds = getfolds(5,nrow(ddf)) stcv = stepcv(ddf,yind,xind,formula(lmtran),folds,nstep) rmseM[,i] = sqrt(apply(stcv,1,sum)/n) } if(dpl) pdf(file="step-cv.pdf",height=8,width=12) plot(c(0,nstep),range(rmseM),type="n", xlab="number of variables",ylab="RMSE",cex.lab=1.5,cex.axis=1.5) for(i in 1:ntry) lines(0:nstep,rmseM[,i],col=i) lines(0:nstep,apply(rmseM,1,mean),type="b",lwd=3,col="black",pch=16) if(dpl) dev.off() xmin = which.min(apply(rmseM,1,mean))-1 cat("vars from forward and cv:\n") print(vnms[1:xmin]) } ###################################################################### if(dpl) rm(list=ls())