##################################################
### basic imports 
import matplotlib.pyplot as plt
import numpy as np
import scipy
import pandas as pd
import math
import seaborn as sns; sns.set()

### random number generator
from numpy.random import default_rng

### sklearn model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

def myrmse(y, yhat):
    """Return the root mean squared error of yhat vs y, rounded to 3 digits."""
    rmse = math.sqrt(mean_squared_error(y, yhat))
    return np.round(rmse, 3)
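
## quick sanity check of myrmse (made-up values, not from the data):
## mse of y=[0,0] vs yhat=[3,4] is (9+16)/2 = 12.5, and sqrt(12.5) = 3.5355...
print('myrmse check:', myrmse(np.array([0, 0]), np.array([3, 4])))  # expect 3.536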

##################################################
### data
##cadat = pd.read_csv("http://www.rob-mcculloch.org/data/calhouse.csv") # worked in spring 21!!
cadat = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/calhouse.csv")
n = cadat.shape[0] # sample size
p = cadat.shape[1] - 1 # number of features (all columns but the response)
y = cadat['logMedVal'].to_numpy() # response: log median house value
x = cadat.iloc[:,:p].to_numpy() # the first p columns are the features
print('n,p: ',n,p)
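
## optional: a quick look at the data for orientation (column names, first rows)
print(cadat.columns.values)
print(cadat.head())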

##################################################
### train, val, test
rng = np.random.default_rng(seed=34) # Auston Matthews
ii = rng.choice(range(n),size=n,replace=False)

n1 = math.floor(n/2.0) # half the data for train
n2 = math.floor(n/4.0) # a quarter of the data for val
n3 = n - n1 - n2 # the rest for test
ii1 = ii[:n1]; x1 = x[ii1]; y1 = y[ii1] #train
ii2 = ii[n1 + np.arange(n2)]; x2 = x[ii2]; y2 = y[ii2] #val
ii3 = ii[n1 + n2 + np.arange(n3)]; x3 = x[ii3]; y3 = y[ii3] #test
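
## optional sanity check: the three pieces should partition the data,
## since ii is a permutation of range(n)
print('split sizes:', n1, n2, n3, ' sum:', n1 + n2 + n3, ' n:', n)
print('indices disjoint and exhaustive:', len(set(ii1) | set(ii2) | set(ii3)) == n)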

## check that the train/val/test split worked by recombining the pieces and
## comparing regression coefficients with those from the full data
lmf = LinearRegression()
lmf.fit(x,y)
print(lmf.coef_)
xx = np.vstack([x1,x2,x3]); yy = np.concatenate([y1,y2,y3])
lmf.fit(xx,yy)
print(lmf.coef_)
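
## rather than eyeballing the two printouts, we can compare the coefficient
## vectors numerically (a minimal sketch using np.allclose)
cf_full = LinearRegression().fit(x, y).coef_
cf_recomb = LinearRegression().fit(xx, yy).coef_
print('coefficients match:', np.allclose(cf_full, cf_recomb))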

##################################################
## fit rf on train, predict on val
# 500 trees, 3 candidate features considered at each split
rfm = RandomForestRegressor(random_state=0,n_jobs=-1,n_estimators=500,max_features=3)
rfm.fit(x1,y1) # fit on train
## predict on val
yhrf = rfm.predict(x2)

plt.scatter(yhrf,y2,c='blue',s=.5)
plt.xlabel('yhat on validation from rf on train');plt.ylabel('validation y')
plt.show()

print('rmse from rf, fit on train, predict on val: ',myrmse(y2,yhrf))
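
## for reference, a linear-regression baseline fit on train and scored on val
## (just to gauge how much the rf improves on a linear fit)
lmf.fit(x1, y1)
yhlm = lmf.predict(x2)
print('rmse from linear regression, fit on train, predict on val: ', myrmse(y2, yhlm))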

##################################################
### important features from Random Forests

## use train and val data
x12 = np.vstack([x1,x2]); y12 = np.concatenate([y1,y2])
rfm.fit(x12,y12)

## variable importances
fimp = rfm.feature_importances_
iifi = np.argsort(fimp)[::-1] #order (max first) of feature importances
fnms = cadat.iloc[:,:p].columns.values # feature names
print("variables in order of importance:",fnms[iifi])

## plot variable importances
fnmS = np.array([fnms[i][:6] for i in range(p)]) # truncate names so they fit in the plot
plt.bar(range(p),fimp[iifi],color='green')
plt.xticks(range(p),fnmS[iifi],rotation=90)
plt.title('variable importances')
plt.show()
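
## impurity-based importances can favor high-cardinality features; as a
## cross-check, sklearn's permutation importance on the held-out test data
## is one alternative (a sketch; n_repeats=10 is an arbitrary choice)
from sklearn.inspection import permutation_importance
pim = permutation_importance(rfm, x3, y3, n_repeats=10, random_state=0, n_jobs=-1)
iipi = np.argsort(pim.importances_mean)[::-1]
print("permutation importance order:", fnms[iipi])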

##################################################
### try boosting
gbm = GradientBoostingRegressor(learning_rate=.2,n_estimators=5000,max_depth=4)
gbm.fit(x1,y1)
yhgb = gbm.predict(x2)

## plot boosting prediction on val
plt.scatter(yhgb,y2,c='blue',s=.5)
plt.xlabel('yhat on validation from gb on train');plt.ylabel('validation y')
plt.show()
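
## with boosting the number of trees matters; staged_predict tracks the
## validation rmse as trees are added, so we can see where it bottoms out
## (a sketch; checking every 100th stage to keep it cheap)
stages = []; vrmse = []
for i, yh in enumerate(gbm.staged_predict(x2)):
    if (i + 1) % 100 == 0:
        stages.append(i + 1)
        vrmse.append(myrmse(y2, yh))
plt.plot(stages, vrmse, c='red')
plt.xlabel('number of trees'); plt.ylabel('validation rmse')
plt.show()
print('best number of trees (of those checked):', stages[np.argmin(vrmse)])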

## compare boosting to rf 
yhdf = pd.DataFrame({'y2':y2,'yhrf':yhrf,'yhgb':yhgb})
print(yhdf.corr())
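
## visualize the agreement between the two sets of predictions
plt.scatter(yhrf, yhgb, c='blue', s=.5)
plt.xlabel('rf predictions on val'); plt.ylabel('gb predictions on val')
plt.show()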


print('rmse: rf, gb:', myrmse(y2, yhrf), myrmse(y2, yhgb))
## very similar!!

##################################################
### let's fit gb on train+val, predict on test

## use train and val data
gbm.fit(x12,y12)
yhgb3 = gbm.predict(x3)
print('rmse on test from gradient boosting: ',myrmse(y3,yhgb3))
# very similar to the validation rmse we got from gb fit on train alone

## plot predictions on test
plt.scatter(y3,yhgb3,c='green',s=.5)
plt.xlabel('test y'); plt.ylabel('test predictions from gb on train+val')
plt.show()

## variable importances
fimp = gbm.feature_importances_
iifi = np.argsort(fimp)[::-1] #order (max first) of feature importances
fnms = cadat.iloc[:,:p].columns.values # feature names
print("variables in order of importance:",fnms[iifi])

