
##################################################
### imports

##basic
import numpy as np
import pandas as pd
import math

## graphics
import matplotlib.pyplot as plt
%matplotlib

## sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV

## keras, will get from tensorflow
import tensorflow as tf

import random

## loss functions
def rmsef(y,yp):
    """Root mean squared error between targets y and predictions yp.

    yp is squeezed first so keras-style (n,1) outputs compare
    correctly against a 1-d y.
    """
    resid = y - np.squeeze(yp)
    return np.sqrt((resid ** 2).mean())
def madf(y,yp):
    """Mean absolute deviation between targets y and predictions yp.

    yp is squeezed so keras-style (n,1) outputs work with 1-d y.
    """
    resid = np.abs(y - np.squeeze(yp))
    return resid.mean()

##################################################
### read in data

## same train/test split as in ISLR lab
## (presumably the ISLR "Hitters" salary data — TODO confirm)
hdtr = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Gitters_train.csv")
hdte = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Gitters_test.csv")
print(hdtr.shape)
print(hdte.shape)
print(hdtr.columns) # names of variables
ntr = hdtr.shape[0] #sample size = number of rows
p = hdtr.shape[1]-1 #number of features = number of columns -1 , last column is y=salary

## get X and y as simple numpy arrays
# train
hdtrnp = hdtr.to_numpy()
Xtr = hdtrnp[:,:p] #first p columns
ytr = hdtrnp[:,-1] #last column

# test
hdtenp = hdte.to_numpy()
Xte = hdtenp[:,:p] #first p columns
yte = hdtenp[:,-1] #last column

## check it is already scaled, should scale using just train, but this is the way ISLR did it
# NOTE(review): stacking train+test here is only a sanity check that the data
# arrived pre-standardized; for fresh data, fit a scaler on train only.
X = np.vstack([Xtr,Xte])
print('feature means (should be 0):')
print(np.mean(X,axis=0))
print('feature sd (should be 1):')
print(np.std(X,axis=0))

##################################################
## fit linear on train, predict on test

lmmod = LinearRegression()
lmmod.fit(Xtr,ytr)
ypredlin = lmmod.predict(Xte) # out-of-sample (test) predictions
yhatlin = lmmod.predict(Xtr)  # in-sample (train) fits

print(f'out of sample test rmse from linear model is {rmsef(yte,ypredlin):0.4f}')
print(f'out of sample test mad from linear model is {madf(yte,ypredlin):0.4f}')

## check: our rmsef agrees with sklearn's mse (after sqrt)
print(f'check: rmse from sklearn mse fun: {math.sqrt(mean_squared_error(yte,ypredlin)):0.4f}')

##plot out of sample prediction
# NOTE(review): no plt.figure() between plots — assumes each section runs in
# its own notebook cell; run as a plain script the plots would overlay.
plt.scatter(yte,ypredlin)
plt.plot(yte,yte,c='r') # 45-degree reference line (perfect prediction)
plt.xlabel('test y'); plt.ylabel('linear model predictions')
plt.title('linear out of sample')

##plot in sample prediction, you can see why they use MAD as well as MSE
plt.scatter(ytr,yhatlin)
plt.plot(ytr,ytr,c='r')
plt.xlabel('train y'); plt.ylabel('linear model fit')
plt.title('linear in sample')

##################################################
## fit LASSO on train, predict on test

## LassoCV seems to have mse hard coded in,
##    unlike glmnet which allows us to choose cv loss with type.measure parameter.
lcv = LassoCV(cv=10)
lcv.fit(Xtr,ytr)

# best alpha: sklearn's name for the selected lasso penalty weight
print("best alpha: ",lcv.alpha_)

# coefficients at the selected alpha; the count of exact zeros shows
# how sparse the chosen model is
print("coefficients at best alpha: ",lcv.coef_)
print("number of 0 coefficients: ",np.sum(lcv.coef_ == 0))

# predicted values: test (ypredL) and train (yhatL)
ypredL = lcv.predict(Xte)
yhatL = lcv.predict(Xtr)

## linear-model results reprinted so the LASSO numbers are easy to compare
print(f'out of sample test rmse from linear model is {rmsef(yte,ypredlin):0.4f}')
print(f'out of sample test mad from linear model is {madf(yte,ypredlin):0.4f}')
print(f'out of sample test rmse from LASSO is {rmsef(yte,ypredL):0.4f}')
print(f'out of sample test mad from LASSO is {madf(yte,ypredL):0.4f}')

## compare LASSO to linear predictions directly
plt.scatter(ypredL,ypredlin)
plt.xlabel('LASSO predictions'); plt.ylabel('linear predictions')
plt.plot(ypredL,ypredL,c='r') # 45-degree reference line

##################################################
### single layer, 50 units, relu, dropout
seed=34
random.seed(seed)
np.random.seed(seed)
# seed python, numpy, and tf RNGs for reproducibility;
# tf.random.set_seed alone may be enough — TODO confirm
tf.random.set_seed(seed)

## make model
nunit =  50            # hidden-layer width
nx = Xtr.shape[1] # number of x's
nn1 = tf.keras.models.Sequential()

## add one hidden layer and dropout, one linear output
nn1.add(tf.keras.Input(shape=(nx,)))
nn1.add(tf.keras.layers.Dense(units=nunit,activation='relu',))
nn1.add(tf.keras.layers.Dropout(.4))     # drop 40% of hidden units each step
nn1.add(tf.keras.layers.Dense(units=1))  # linear output for regression

#compile model: mse loss, rmsprop optimizer, also track MAE per epoch
nn1.compile(loss='mse',optimizer='rmsprop',metrics=['mean_absolute_error'])

# fit
# NOTE(review): validation_data is the TEST set, so per-epoch val_loss is
# computed on test data — fine for illustration, not for model selection.
nepoch = 1500
nhist = nn1.fit(Xtr,ytr,epochs=nepoch,verbose=1,batch_size=32,validation_data=(Xte,yte))

# summary
nn1.summary()

##################################################
### plot training by epoch
yhatnn = nn1.predict(Xtr)  # in-sample fits
yprednn = nn1.predict(Xte) # out-of-sample predictions

# history stores mse per epoch; sqrt converts to rmse for plotting
trL = np.sqrt(nhist.history['loss'])
teL = np.sqrt(nhist.history['val_loss'])
epind = range(1,len(trL)+1) # epoch indices 1..nepoch
plt.plot(epind,trL,c='red',label='train')
plt.plot(epind,teL,c='blue',label='validation')
plt.xlabel('epoch'); plt.ylabel('rmse')
# horizontal line: linear-model test rmse as a baseline
plt.axhline(rmsef(yte,ypredlin),linestyle='--',label='linear val rmse')
plt.legend()
minrmse = teL.min()
plt.title(f'training, min validation rmse is {minrmse:0.2f}')

##################################################
### plot and rmse/mad

# out-of-sample (test): nn vs linear predictions against true test y
plt.scatter(yte,yprednn,c='blue',s=30,label='neural net')
plt.scatter(yte,ypredlin,c='green',s=20,label='linear')
plt.plot(yte,yte,c='r') # 45-degree reference line
plt.xlabel('test y'); plt.ylabel('predictions')
plt.legend()

# in-sample (train): nn fits against true train y
plt.scatter(ytr,yhatnn)
plt.plot(ytr,ytr,c='r')
plt.xlabel('train y'); plt.ylabel('nn fits')

# nn fits vs linear fits (in-sample agreement between the two models)
plt.scatter(yhatnn,yhatlin)
plt.plot(yhatnn,yhatnn,c='r')
plt.xlabel('nn fits'); plt.ylabel('linear fits')

## final out-of-sample score card: rmse then mad for all three models
print(f'out of sample test rmse from linear model is {rmsef(yte,ypredlin):0.4f}')
print(f'out of sample test rmse from LASSO is {rmsef(yte,ypredL):0.4f}')
print(f'out of sample test rmse from neural net is {rmsef(yte,yprednn):0.4f}')
print('\n\n')

print(f'out of sample test mad from linear model is {madf(yte,ypredlin):0.4f}')
print(f'out of sample test mad from LASSO is {madf(yte,ypredL):0.4f}')
print(f'out of sample test mad from neural net is {madf(yte,yprednn):0.4f}')