##################################################
### imports
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve

from xgboost import XGBRegressor

# %matplotlib
import matplotlib.pyplot as plt

##################################################
### data
##  X,y = datasets.load_diabetes(return_X_y=True)
## There is a version of the diabetes data in sklearn, but let's use the version from Rob's data-sets page, which comes from Hastie.
##  print("X is:")
##  print(X.shape)
##  print("y is:")
##  print(y.shape)

#ddf = pd.read_csv("diabetes.csv") # from Rob data sets webpage
ddf = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/diabetes.csv") # from Rob data sets webpage
xvnms = ddf.columns.values[1:11]   # predictor names (column 0 is the response)
yX = ddf.to_numpy()
y = yX[:,0]       # response
X = yX[:,1:11]    # the 10 predictors

print('Shape of X:', X.shape)
print('Shape of y:', y.shape)
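
## quick optional sanity check on the downloaded frame (a sketch; ddf and xvnms as above):
## variable names, a few rows, and a missing-value count
print('predictors:', list(xvnms))
print(ddf.head(3))
print('missing values:', int(ddf.isna().sum().sum()))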

##################################################
### xg boost model

xgbmod = XGBRegressor(booster='gbtree',objective='reg:squarederror',
   max_depth=2, learning_rate=0.1, n_estimators=100, random_state=2, n_jobs=-1)
xgbmod.fit(X,y)
fimp = pd.DataFrame({'nms':xvnms,'imp':xgbmod.feature_importances_})
print(fimp)

## plot fimp

plt.plot(fimp['imp'],c='blue')
plt.scatter(range(10),fimp['imp'],c='r')
plt.xticks(range(10),fimp['nms'],fontsize=14)

# or, sorted from most to least important
ii = np.argsort(-fimp['imp'].to_numpy())
plt.plot(range(10),fimp['imp'].iloc[ii],c='blue')
plt.scatter(range(10),fimp['imp'].iloc[ii],c='r')
plt.xticks(range(10),fimp['nms'].iloc[ii],fontsize=14)

# note: the variable importances do not look right compared to what BART gives!
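
## the single number in feature_importances_ depends on the importance metric
## (importance_type), and the default differs across xgboost versions; to see
## whether the ordering is sensitive to that choice, a sketch comparing the
## tree-based metrics via the underlying booster (for a numpy X the booster
## names features 'f0',...,'f9'):
booster = xgbmod.get_booster()
for t in ['weight','gain','cover']:
    sc = booster.get_score(importance_type=t)
    named = {xvnms[int(k[1:])]: round(v,3) for k,v in sc.items()}
    print(t, ':', named)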

##################################################
### cross-val

cvres = cross_val_score(xgbmod,X,y,scoring='neg_mean_squared_error',cv=5)

# transform neg_mean_squared_error to rmse
rmse = np.sqrt(np.mean(-cvres)) 
print('the rmse with 5-fold is:', np.round(rmse,3))
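
## optional variant: compute rmse within each fold first, which also shows
## fold-to-fold variability (same cvres as above)
fold_rmse = np.sqrt(-cvres)
print('per-fold rmse:', np.round(fold_rmse,3))
print('mean rmse:', np.round(fold_rmse.mean(),3), ' sd:', np.round(fold_rmse.std(),3))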

##################################################
###  cross-validation on a grid of learning_rate values using the sklearn validation_curve function

## d=2
xgbmod = XGBRegressor(booster='gbtree',objective='reg:squarederror',
   max_depth=2, learning_rate=0.1, n_estimators=500, random_state=2, n_jobs=-1)

# do cv at every value of lr in lrv
lrv = np.linspace(start=.001,stop=.03,num=100)
trainS, testS = validation_curve(xgbmod,X,y,param_name='learning_rate',param_range=lrv,cv=10,scoring='neg_mean_squared_error')

# transform neg_mean_squared_error to rmse
trrmse = np.sqrt(-trainS.mean(axis=1))
termse = np.sqrt(-testS.mean(axis=1))

#plot in and out of sample rmse
plt.scatter(lrv,termse,c='blue',s=5)
plt.scatter(lrv,trrmse,c='red',s=5)
plt.xlabel("learning-rate"); plt.ylabel("rmse")
plt.legend(['test, d=2','train, d=2'])
#plt.savefig("xgb-diabetes_d2_train-test.pdf")

print(f'the min rmse with d=2 is {np.round(np.min(termse),3)}')
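
## the learning rate where the test rmse bottoms out (termse and lrv as above)
print('lr minimizing test rmse with d=2:', np.round(lrv[np.argmin(termse)],4))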

## d=3
xgbmod1 = XGBRegressor(booster='gbtree',objective='reg:squarederror',
   max_depth=3, learning_rate=0.1, n_estimators=500, random_state=2, n_jobs=-1)

# do cv at every value of lr in lrv
lrv = np.linspace(start=.001,stop=.03,num=100)
#lrv = np.linspace(start=.001,stop=.04,num=200)
trainS1, testS1 = validation_curve(xgbmod1,X,y,param_name='learning_rate',param_range=lrv,cv=10,scoring='neg_mean_squared_error')

# transform neg_mean_squared_error to rmse
trrmse1 = np.sqrt(-trainS1.mean(axis=1))
termse1 = np.sqrt(-testS1.mean(axis=1))

#plot in and out of sample rmse
plt.scatter(lrv,termse1,c='blue',s=5)
plt.scatter(lrv,trrmse1,c='red',s=5)
print(f'the min rmse with d=3 is {np.round(np.min(termse1),3)}')

## plot d=2 and 3
plt.scatter(lrv,termse,c='blue',s=5)
plt.scatter(lrv,termse1,c='green',s=5)
plt.xlabel("learning-rate"); plt.ylabel("test rmse")
plt.legend(['d=2','d=3'])
#plt.scatter(lrv,trrmse,c='red',s=5)
#plt.scatter(lrv,trrmse1,c='orange',s=5)
#plt.show()
#plt.savefig("xgb_d23.pdf")

## d=4
xgbmod2 = XGBRegressor(booster='gbtree',objective='reg:squarederror',
   max_depth=4, learning_rate=0.1, n_estimators=500, random_state=2, n_jobs=-1)

# do cv at every value of lr in lrv
lrv = np.linspace(start=.001,stop=.03,num=100)
#lrv = np.linspace(start=.001,stop=.04,num=200)
trainS2, testS2 = validation_curve(xgbmod2,X,y,param_name='learning_rate',param_range=lrv,cv=10,scoring='neg_mean_squared_error')

# transform neg_mean_squared_error to rmse
trrmse2 = np.sqrt(-trainS2.mean(axis=1))
termse2 = np.sqrt(-testS2.mean(axis=1))

#plot in and out of sample rmse
plt.scatter(lrv,termse2,c='blue',s=5)
plt.scatter(lrv,trrmse2,c='red',s=5)
print(f'the min rmse with d=4 is {np.round(np.min(termse2),3)}')

## plot d=2, 3, and 4
plt.scatter(lrv,termse,c='blue',s=5)
plt.scatter(lrv,termse1,c='green',s=5)
plt.scatter(lrv,termse2,c='magenta',s=5)
plt.xlabel("learning-rate"); plt.ylabel("test rmse")
plt.legend(['d=2','d=3','d=4'])

## d=1
xgbmod3 = XGBRegressor(booster='gbtree',objective='reg:squarederror',
   max_depth=1, learning_rate=0.1, n_estimators=500, random_state=2, n_jobs=-1)

# do cv at every value of lr in lrv
lrv = np.linspace(start=.001,stop=.03,num=100)
#lrv = np.linspace(start=.001,stop=.04,num=200)
trainS3, testS3 = validation_curve(xgbmod3,X,y,param_name='learning_rate',param_range=lrv,cv=10,scoring='neg_mean_squared_error')

# transform neg_mean_squared_error to rmse
trrmse3 = np.sqrt(-trainS3.mean(axis=1))
termse3 = np.sqrt(-testS3.mean(axis=1))

#plot in and out of sample rmse
plt.scatter(lrv,termse3,c='blue',s=5)
plt.scatter(lrv,trrmse3,c='red',s=5)
print(f'the min rmse with d=1 is {np.round(np.min(termse3),3)}')

## plot d=1, 2, 3, and 4
plt.scatter(lrv,termse,c='blue',s=5)
plt.scatter(lrv,termse1,c='green',s=5)
plt.scatter(lrv,termse2,c='magenta',s=5)
plt.scatter(lrv,termse3,c='red',s=5)
plt.xlabel("learning-rate"); plt.ylabel("test rmse")
plt.legend(['d=2','d=3','d=4','d=1'])
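
##################################################
### compact alternative (a sketch): instead of repeating the validation_curve block
### once per depth, GridSearchCV can search max_depth and learning_rate jointly;
### the grid values below are illustrative, cv and scoring as above
from sklearn.model_selection import GridSearchCV
pgrid = {'max_depth':[1,2,3,4], 'learning_rate':np.linspace(start=.001,stop=.03,num=10)}
basemod = XGBRegressor(booster='gbtree', objective='reg:squarederror',
   n_estimators=500, random_state=2, n_jobs=-1)
gs = GridSearchCV(basemod, pgrid, cv=10, scoring='neg_mean_squared_error')
gs.fit(X,y)   # 4 depths x 10 rates x 10 folds = 400 fits, so this takes a while
print('best params:', gs.best_params_)
print('best cv rmse:', np.round(np.sqrt(-gs.best_score_),3))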


