
## read in the data
cd = read.csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")

##  only price, mileage, and year are numberic
sapply(cd,is.numeric)

# for example, let's look at trim
table(cd$trim)

#let's make the categorical variables factors in R
iifac = c(2,3,6,7)
for(i in iifac) cd[,i] = as.factor(cd[,i])
summary(cd)

# A simple linear regression: R automatically expands the factor trim
# into dummy variables.
sreg <- lm(price ~ mileage + trim, data = cd)
summary(sreg)

# The dummies give each trim level its own intercept for the linear
# relationship between price and mileage, so the fitted values fall on
# parallel lines (one color per trim level).
plot(cd$mileage, sreg$fitted.values, col = as.integer(cd$trim))

# Now try the LASSO, using the glmnet package.
library(glmnet)

# glmnet does not use the formula interface: it needs a numeric matrix x
# and a response vector y for the model y = X beta + epsilon.

# For the simple model above that means we need the dummy-expanded design
# matrix. model.matrix() builds it from the formula; the leading column of
# ones (the intercept) is dropped with [, -1].

x <- model.matrix(price ~ mileage + trim, data = cd)[, -1]
print(dim(x))
head(x)

# Sanity check: regressing y on x directly should reproduce the fitted
# values of sreg exactly (the summary of the differences should be ~ 0).
tdf <- data.frame(x, y = cd$price)
tlm <- lm(y ~ ., data = tdf)
print(summary(tlm$fitted.values - sreg$fitted.values))

# Run the LASSO with glmnet (the package vignette is good: see
# browseVignettes() in R). This time use ALL the predictors, so call
# model.matrix again on the full data frame cd. cv.glmnet performs the
# cross-validation needed to choose the penalty lambda.

y <- cd$price
x <- model.matrix(price ~ ., data = cd)[, -1]
set.seed(14) # Dave Keon
cars.gcv <- cv.glmnet(x, y, type.measure = "mse", nfolds = 10, alpha = 1)
plot(cars.gcv)

## The two standard lambda choices: the MSE-minimizing value, and the
## largest lambda within one standard error of that minimum.
lmin <- cars.gcv$lambda.min
l1se <- cars.gcv$lambda.1se
cat("lambda min and lambda 1se are: ", lmin, l1se, "\n")


#Let’s plot the RMSE.
crmse = sqrt(cars.gcv$cvm) #glmnet use mse
plot(crmse)
cat("the min rmse is: ",min(crmse),"\n")

# Now get the actual LASSO fit. We could call glmnet() directly, but the
# fit on all the data is already stored inside the cv results, so just
# pull it out of there and inspect the coefficient paths.

cars.las <- cars.gcv$glmnet.fit
plot(cars.las)

## Coefficients at the 1se lambda; [, 1] converts from the sparse-matrix
## format to a plain named vector.
bhatL1 <- coef(cars.las, s = l1se)[, 1]
names(bhatL1[abs(bhatL1) == 0]) # which ones are zero?

## Coefficients at the MSE-minimizing lambda.
bhatLm <- coef(cars.las, s = lmin)[, 1]
names(bhatLm[abs(bhatLm) == 0]) # which ones are zero?

## Predictions at both lambda values. Here we predict at the training x;
## for out-of-sample prediction we would of course supply different x rows.

fmat <- predict(cars.las, newx = x, s = c(lmin, l1se))
fmat <- cbind(y, fmat)
colnames(fmat) <- c("y", "yhatmin", "yhat1se")
pairs(fmat)