##################################################
### libraries
library(rpart)       # recursive partitioning (regression trees)
library(MASS)        # provides the Boston housing data set
library(rpart.plot)  # nicer tree plots
data(Boston)
# NOTE(review): attach(Boston) removed -- attach() puts the data frame on the
# search path and invites masking bugs; every later use already references
# columns explicitly (data = ddf, Boston$medv), so it was dead weight.

##################################################
### pick off some of the variables
# Select columns by name rather than by position: positional indices
# (c(8, 13, 14)) silently break if the column order ever changes.
ddf <- Boston[, c("dis", "lstat", "medv")]
print(names(ddf))

##################################################
### fit a single tree and plot variable importance
## fit a deliberately overgrown tree using rpart.control
## (small minsplit and tiny cp let the tree grow large; pruned below)
set.seed(99)  # rpart's internal cross-validation is random; fix the seed
big.tree <- rpart(
  medv ~ ., method = "anova", data = ddf,
  control = rpart.control(minsplit = 5, cp = .0005)
)
# big.tree$where maps each observation to its terminal node, so the number
# of unique values is the number of leaves (tree size).
nbig <- length(unique(big.tree$where))
cat("size of big tree: ", nbig, "\n")

##################################################
### look at CV results
# Plot the cross-validated relative error against the complexity parameter;
# this visualizes the pruning trade-off that is resolved numerically below.
plotcp(big.tree)

##################################################
### get nice tree from CV results
# Pick the cptable row with the lowest cross-validated error, then prune
# the big tree back to that complexity.
iibest <- which.min(big.tree$cptable[, "xerror"]) # row with the lowest CV error
bestcp <- big.tree$cptable[iibest, "CP"]
bestsize <- big.tree$cptable[iibest, "nsplit"] + 1 # leaves = splits + 1
# note: in cptable:
# CP is the complexity parameter
# rel error is the average deviance of the current tree
# divided by the average deviance of the null tree (in-sample fit).
# xerror is based on a 10-fold cross-validation and is again measured
# relative to the deviance of the null model (out-of-sample fit).

# prune to the CV-chosen tree
best.tree <- prune(big.tree, cp = bestcp)
# size = number of distinct terminal nodes observations land in
nbest <- length(unique(best.tree$where))
cat("size of best tree: ", nbest, "\n")

##################################################
### plot tree
# Base-graphics rendering: uniform = TRUE spaces the levels evenly;
# use.n = TRUE annotates each node with the number of observations in it.
plot(best.tree,uniform=TRUE)
text(best.tree,digits=4,use.n=TRUE)

## using rpart.plot
# type = 3 draws split labels on the branches; extra = 0 suppresses
# extra node statistics.
rpart.plot(best.tree,split.cex=1.0,cex=1.3,type=3,extra=0)

##################################################
### get fits
# In-sample fitted values from the pruned tree (predict with no newdata
# returns fits on the training data).
yhat <- predict(best.tree)
plot(Boston$medv, yhat)
abline(0, 1, col = "red", lwd = 3) # 45-degree line: perfect-fit reference



