# Output switch used throughout this script: when TRUE, the plots below are
# written to PDF files and the workspace is cleared at the end of the run.
# It is also passed as the second argument of every printfl() call.
# For an interactive run use `dpl <- FALSE` instead.
dpl <- TRUE
# Helper functions; printfl() is not defined in this file, so it presumably
# comes from here -- confirm.
source("notes-funs.R")
######################################################################
if (TRUE) {
  cat("### librarys\n")
  # textir ships the we8there data set; glmnet supplies cv.glmnet() used below.
  library(textir)
  library(glmnet)
  # Load the data set; the rest of the script reads we8thereRatings and
  # we8thereCounts after this call.
  data(we8there)
}

######################################################################
if (TRUE) {
  cat("### process data\n")
  # Each observation is a restaurant review (6166 of them): we have an overall
  # rating per review and the bi-gram counts of its text.
  #
  # Values of expressions inside a braced block are not auto-printed, so every
  # inspection step is wrapped in print() to make it actually show up.
  print(head(we8thereRatings))   # y is derived from these ratings
  print(dim(we8thereCounts))     # x: number of times a term occurs in a review

  # Non-zero entries of the first row, i.e. the terms present in review 1.
  first_review <- we8thereCounts[1, we8thereCounts[1, ] != 0]
  print(first_review)

  # Binary response: 1 when the overall rating is above 3, otherwise 0,
  # stored as a factor.
  y <- as.factor(ifelse(we8thereRatings$Overall > 3, 1, 0))

  # Hand the summaries to printfl() (defined outside this file) together with
  # the dpl switch and a target file name.
  printfl(table(we8thereRatings$Overall), dpl, "overallratings.rtxt")
  printfl(table(y), dpl, "y.rtxt")
  printfl(dim(we8thereCounts), dpl, "dimx.rtxt")
  printfl(first_review, dpl, "first-x-row.rtxt")
}
######################################################################
if (TRUE) {
  cat("### fit lasso\n")

  # Cross-validated L1-penalised logistic regression of y on the term counts.
  # NOTE(review): the CV folds are drawn at random, so lambda.min / lambda.1se
  # (and everything derived from them) change between runs unless set.seed()
  # is called before this point.
  cvfit <- cv.glmnet(
    x = we8thereCounts,
    y = y,
    family = "binomial",
    alpha = 1,     # 1 = lasso, 0 = ridge
    nfolds = 10    # full argument name; "nfold" only worked by partial matching
  )

  # Two stacked panels: the CV curve on top, the coefficient paths below.
  if (dpl) pdf(file = "lassofit.pdf", height = 12, width = 12)
  old_par <- par(mfrow = c(2, 1))
  plot(cvfit)
  plot(cvfit$glmnet.fit)
  # Close only the PDF device this section opened. Without a PDF the plots
  # stay on screen, so just restore the layout for whatever is drawn next.
  if (dpl) {
    dev.off()
  } else {
    par(old_par)
  }
}
######################################################################
if (TRUE) {
  cat("### look at lasso coefs\n")

  # Number of non-zero coefficients along the lambda path, with the two
  # cross-validated choices marked (red: lambda.1se, blue: lambda.min).
  if (dpl) pdf(file = "lasso-numzero.pdf", height = 10, width = 14)
  p <- ncol(we8thereCounts)  # not used in this section; kept as in the original
  # Spell out glmnet.fit: "cvfit$glmnet" only resolved through partial matching.
  plot(cvfit$glmnet.fit$lambda, cvfit$glmnet.fit$df,
       xlab = "lambda", ylab = "number of non-zero coefficients")
  abline(v = cvfit$lambda.1se, col = "red")
  abline(v = cvfit$lambda.min, col = "blue")
  # Close the device only if this section opened a PDF.
  if (dpl) dev.off()

  # Coefficients at lambda.1se: which are positive / negative, and how do they
  # read? coef() returns a one-column sparse matrix, so convert it to a plain
  # named numeric vector before ordering instead of relying on implicit coercion.
  coefL <- coef(cvfit$glmnet.fit, s = cvfit$lambda.1se)
  coef_vals <- as.matrix(coefL)[, 1]
  oo <- order(coef_vals, decreasing = TRUE)

  ncoef <- 30
  top_idx <- head(oo, ncoef)      # largest coefficients first
  bottom_idx <- tail(oo, ncoef)   # most negative coefficient last

  # Assemble the report in memory and write it in one go. capture.output()
  # cleans up after itself, so console output is never left diverted if a
  # print call fails (unlike a bare sink()/sink() pair).
  report <- c(
    "\\begin{verbatim}",
    "Big positive coefficients:",
    capture.output(print(names(coef_vals)[top_idx])),
    capture.output(print(unname(coef_vals[top_idx]))),
    "",
    "",
    "Big negative coefficients:",
    capture.output(print(names(coef_vals)[bottom_idx])),
    capture.output(print(unname(coef_vals[bottom_idx]))),
    "\\end{verbatim}"
  )
  writeLines(report, "w8-lasso-coefs.rtxt")
}

######################################################################
# After a file-producing run (dpl is TRUE) remove every object visible to
# ls() here, including dpl itself.
# NOTE(review): wiping the workspace from inside a script is usually
# discouraged; confirm that callers sourcing this file expect it.
if (dpl) {
  rm(list = ls())
}
