##################################################
### data, train/test split
library(ISLR2)

# Drop players with missing values (Salary is NA for some rows).
Gitters <- na.omit(Hitters)
n <- nrow(Gitters)

# Hold out one third of the observations as a test set.
set.seed(13)
ntest <- trunc(n / 3)
testid <- sample(seq_len(n), ntest)

##################################################
### linear model baseline
# Fit OLS on the training rows, then report the mean absolute
# prediction error on the held-out test rows.
lfit <- lm(Salary ~ ., data = Gitters[-testid, ])
lpred <- predict(lfit, newdata = Gitters[testid, ])
mean(abs(lpred - Gitters$Salary[testid]))

##################################################
### response vector and standardized design matrix
# model.matrix() expands factors into dummy columns; "- 1" drops the
# intercept column, and scale() standardizes each remaining column.
design <- model.matrix(Salary ~ . - 1, data = Gitters)
x <- scale(design)
y <- Gitters[["Salary"]]

##################################################
### lasso via cross-validated glmnet

library(glmnet)

# Cross-validate over the lambda path, selecting by mean absolute error,
# then evaluate at lambda.min on the test rows.
cvfit <- cv.glmnet(
  x[-testid, ], y[-testid],
  type.measure = "mae"
)
cpred <- predict(cvfit, x[testid, ], s = "lambda.min")
mean(abs(y[testid] - cpred))

# CV error curve and coefficient paths.
plot(cvfit)
plot(cvfit$glmnet.fit)


##################################################
### torch

library(torch)
library(luz) # high-level interface for torch
library(torchvision) # for datasets and image transformation

library(torchdatasets) # for datasets we are going to use
library(zeallot)
torch_manual_seed(13)

##################################################
### nn model

# One hidden layer of 50 ReLU units with 40% dropout, followed by a
# single linear output unit (quantitative response, no activation).
modnn <- nn_module(
  initialize = function(input_size) {
    self$hidden <- nn_linear(input_size, 50)
    self$activation <- nn_relu()
    self$dropout <- nn_dropout(0.4)
    self$output <- nn_linear(50, 1)
  },
  forward = function(x) {
    # Nested-call form of: hidden -> relu -> dropout -> output.
    self$output(self$dropout(self$activation(self$hidden(x))))
  }
)

## The object modnn has a single hidden layer with 50 hidden units, 
## and a ReLU activation function. 
## It then has a dropout layer, in which a random 40% of the 50 activations 
## from the previous layer are set to zero during each iteration of the 
## stochastic gradient descent algorithm. 
## Finally, the output layer has just one unit with no activation function, 
## indicating that the model provides a single quantitative output.

##################################################
### make x again, this time using pipe notation

# The "." placeholder routes Gitters into the data argument.
x <- Gitters %>%
  model.matrix(Salary ~ . - 1, data = .) %>%
  scale()

##################################################
### loss and optimizer

# setup() records how the module will be trained; set_hparams() supplies
# the arguments forwarded to initialize() (here, the number of columns
# of the design matrix).
modnn <- setup(
  modnn,
  loss = nn_mse_loss(),
  optimizer = optim_rmsprop,
  metrics = list(luz_metric_mae())
)
modnn <- set_hparams(modnn, input_size = ncol(x))

##################################################
### fit, i.e. run SGD-style optimization (rmsprop)

# Targets are passed as one-column matrices to match the single output unit.
fitted <- fit(
  modnn,
  data = list(x[-testid, ], matrix(y[-testid], ncol = 1)),
  valid_data = list(x[testid, ], matrix(y[testid], ncol = 1)),
  epochs = 20
)

# Training/validation loss and MAE by epoch.
plot(fitted)

##################################################
## predict and loss

## as in lab: predict() on a luz-fitted model returns a torch tensor,
## which as.matrix() converts for the MAE computation
npred <- predict(fitted, x[testid, ])
mean(abs(y[testid] - as.matrix(npred)))

## first convert npred to a plain R array. npred is already a torch
## tensor, so re-wrapping it with torch_tensor() is wrong — move it to
## the CPU with $to() and then unwrap with as_array().
nnpred <- as_array(npred$to(device = "cpu"))
mean(abs(y[testid] - nnpred))

## compare the predictions from linear, lasso, and neural net
pMat <- cbind(y[testid], lpred, cpred, nnpred)
colnames(pMat) <- c("ytest", "linear", "lasso", "nnet")
pairs(pMat)
print(cor(pMat))

## in summary, the mean absolute deviations (columns 2+ vs. ytest):
for (i in 2:ncol(pMat)) {
  cat(paste0(
    "mad for ", colnames(pMat)[i], " is ",
    mean(abs(pMat[, 1] - pMat[, i])), "\n"
  ))
}


