# Plot/cleanup switch: when TRUE the ridge plots are written to a PDF file
# and the workspace is cleared at the end of the script.
# dpl <- FALSE
dpl <- TRUE

# Project-local helper definitions (contents not shown in this file).
source("notes-funs.R")
######################################################################
if (TRUE) {
  cat("### librarys\n")

  # textir ships the we8there data set; glmnet provides the penalised
  # regression fit used further down.
  library(textir)
  library(glmnet)

  # Loads we8thereRatings and we8thereCounts into the workspace.
  data(we8there)
}

######################################################################
if (TRUE) {
  cat("### process data\n")

  # Each observation is a restaurant review (6166 ratings in total).
  # we8thereRatings holds the ratings (including the overall one) and
  # we8thereCounts holds, per review, how often each bi-gram occurs.
  #
  # Expressions inside `{ }` are not auto-printed, so every inspection
  # step is wrapped in print() explicitly.
  print(head(we8thereRatings)) # y is derived from these ratings
  print(dim(we8thereCounts))   # x: number of times a term is in the review

  # Terms that actually occur in the first review.
  first_review <- we8thereCounts[1, ]
  print(first_review[first_review != 0])

  # Binary response: 1 when the overall rating is above 3, otherwise 0.
  # Explicit levels keep both classes present in the factor.
  y <- factor(as.integer(we8thereRatings$Overall > 3), levels = 0:1)
}
######################################################################
if (TRUE) {
  cat("### fit Ridge\n")

  # Cross-validated ridge-penalised logistic regression of the binary
  # rating y on the bi-gram counts.
  cvfit <- cv.glmnet(
    x = we8thereCounts,
    y = y,
    family = "binomial",
    alpha = 0,   # lasso - 1, ridge - 0
    nfolds = 10  # spelled out in full; `nfold` only worked by partial matching
  )

  # Plot the CV curve and the coefficient paths, stacked vertically.
  # The PDF device is opened only when dpl is TRUE, so it must also be
  # closed only in that case; otherwise dev.off() would shut the
  # on-screen device right after drawing.
  if (dpl) pdf(file = "ridgefit.pdf", height = 12, width = 12)
  tryCatch(
    {
      old_par <- par(mfrow = c(2, 1))
      plot(cvfit)
      plot(cvfit$glmnet.fit)
      par(old_par) # leave the layout as we found it
    },
    finally = if (dpl) dev.off() # close the file even if plotting fails
  )
}
######################################################################
if (TRUE) {
  cat("### look at ridge coefs\n")

  # Look at coefficients: which are positive/negative, and how to interpret?
  # Coefficients are taken at the lambda.1se value chosen by cross-validation.
  # coef() returns a one-column sparse matrix whose first row is the
  # intercept; convert it to a plain vector before ordering.
  coefR <- coef(cvfit$glmnet.fit, s = cvfit$lambda.1se)
  coef_vals <- as.matrix(coefR)[, 1]
  coef_names <- rownames(coefR)
  oo <- order(coef_vals, decreasing = TRUE)

  ncoef <- 30
  # head()/tail() cope with fewer than ncoef coefficients, whereas
  # oo[1:ncoef] would pad the index with NA.
  top_idx <- head(oo, ncoef)
  bottom_idx <- tail(oo, ncoef)

  # Write the report wrapped in a LaTeX verbatim environment. The sink is
  # always released, even if a print step fails, so console output is not
  # left diverted into the file.
  sink("w8-ridge-coefs.rtxt")
  tryCatch(
    {
      cat("\\begin{verbatim}\n")
      # positive coefficients
      cat("Big positive coefficients:\n")
      print(coef_names[top_idx])
      print(unname(coef_vals[top_idx]))
      # negative coefficients
      cat("\n\nBig negative coefficients:\n")
      print(coef_names[bottom_idx])
      print(unname(coef_vals[bottom_idx]))
      cat("\\end{verbatim}\n")
    },
    finally = sink()
  )
}

######################################################################
# When dpl is TRUE, remove every object from the workspace once the
# script has finished.
if (dpl) {
  rm(list = ls())
}
