
##################################################
### imports
import numpy as np
import pandas as pd
import math
import scipy as sp

import matplotlib.pyplot as plt
#ipython terminal
#%matplotlib
#jupyter notebook
#%matplotlib inline 


from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

##################################################
## boston data

# pull the Boston housing data straight off the web into a DataFrame
bd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/Boston.csv")
print("*** the type of bd is:")
print(type(bd))
print(bd.head())

## convert to numpy: the last column (medv) is the target, the rest are features
bdnp = bd.to_numpy()
X, y = bdnp[:, :-1], bdnp[:, -1]

## check: quick scatter of the last feature column against the target
plt.scatter(X[:, -1], y)

##################################################
## scale, note THIS IS DATA LEAKAGE,  for a pipeline version see below.
# Standardize every feature column to zero mean / unit sd. Fitting the
# scaler on the full data before any CV is the leakage being demonstrated;
# the Pipeline version below does it properly.
scl = StandardScaler()
Xs = scl.fit_transform(X)
print("means should be 0, sds should be 1")
for check in (Xs.mean(axis=0), Xs.std(axis=0)):
    print(check)

##################################################
## simple regression
# ordinary least squares on the standardized features (fit returns self)
lmod = LinearRegression().fit(Xs, y)

# in-sample fitted values
yhatl = lmod.predict(Xs)

### plot fitted vs actual; dotted red line is the 45-degree reference
plt.scatter(y, yhatl)
plt.xlabel('y')
plt.ylabel('yhat')
plt.plot(y, y, c='red', linestyle='dotted')

##################################################
## ridge
# 10-fold CV over a grid of 100 penalty weights between 1 and 200
alphas = np.linspace(1, 200, 100)
rcv = RidgeCV(alphas, cv=10).fit(Xs, y)
print(rcv.alpha_)
print(rcv.coef_)
yhatr = rcv.predict(Xs)



##################################################
## lasso
# LassoCV builds its own alpha grid and picks the best by 5-fold CV
lcv = LassoCV(cv=5)
lcv.fit(Xs, y)

# look at alphas used -- print explicitly so the summary also shows
# when this runs as a script, not just echoed in a REPL
print("number of alphas used:", lcv.n_alphas)
print(pd.Series(lcv.alphas_).describe())

# best alpha and coefficients
print("best alpha: ", lcv.alpha_)

# coefficients at the chosen alpha; lasso can zero some out entirely
print("coefficients at best alpha: ", lcv.coef_)
print("number of 0 coefficients: ", np.sum(lcv.coef_ == 0))

# fitted values
yhatL = lcv.predict(Xs)

# cv mse: mse_path_ is (n_alphas, n_folds), so average over folds (axis=1).
# (The original summed across folds, which plots n_folds times the mean
# and is not itself an MSE -- the curve shape is the same, the scale isn't.)
msep = lcv.mse_path_
mses = msep.mean(axis=1)
plt.scatter(np.log(lcv.alphas_), mses)
plt.xlabel('log alpha'); plt.ylabel('mse')

##################################################
## look
# correlations among the actual y and the three sets of fitted values;
# print explicitly so the table also shows when run as a script
# (a bare expression result is silently discarded outside a REPL)
pyhat = pd.DataFrame({'y': y, 'yhatlin': yhatl, 'yhatridge': yhatr, 'yhatLasso': yhatL})
print(pyhat.corr())

##################################################
## Lasso path
# coefficient trajectories over the full alpha grid;
# path() returns (alphas, coefs, dual_gaps)
Lpath = lcv.path(Xs,y)
kk = 12 ## which coefficient to plot; 12 is the LAST X column, which is
        ## lstat in the usual Boston ordering -- medv is the target y and
        ## is not in X at all (TODO confirm column order against the csv)
plt.scatter(Lpath[0][-30:-1],Lpath[1][kk,-30:-1]) # [0] is alphas, [1] is the nvar x nalphas coefficient path; path alphas run large->small, so this slice is the small-alpha tail (the -1 endpoint drops the very last entry)
plt.xlabel('alpha'); plt.ylabel('coefficent')

print(pd.Series(Lpath[1][kk,:]).describe())

##################################################
### use pipeline to avoid data leakage

# Create a pipeline that scales ONLY within the cross-validation folds
Pln = Pipeline([
    ('scaler', StandardScaler()),  # Scales data within each cross-validation fold
    ('lasso', LassoCV(
        cv=10,            # 10-fold cross-validation
        random_state=34   # fixed seed so the fold assignment is reproducible
    ))
])

## fit/predict on the raw X; standardization happens inside the pipeline
Pln.fit(X, y)
yhatPln = Pln.predict(X)

plt.scatter(y, yhatPln)
plt.plot(y, y, c='red')

## look -- print so the correlation table shows when run as a script
pyhat = pd.DataFrame({'y': y, 'yhatlin': yhatl, 'yhatridge': yhatr,
                      'yhatLasso': yhatL, 'yhatPln': yhatPln})
print(pyhat.corr())

## pull the fitted LassoCV back out of the pipeline by its step name
## (renamed from 'junk' -- a descriptive name, same behavior)
print(Pln.named_steps.keys())
lasso_step = Pln.named_steps['lasso']
print(lasso_step.alpha_)
print(lasso_step.coef_)

##################################################
### compare coefficents
# OLS coefficients on the x axis, shrunken ones on the y axis; points
# pulled toward zero sit between the red 45-degree line and the x axis.
ols_coef = lmod.coef_
plt.scatter(ols_coef, lcv.coef_, c='green', label='lasso')
plt.scatter(ols_coef, rcv.coef_, c='blue', label='ridge')
plt.plot(ols_coef, ols_coef, c='red')
plt.xlabel('linear coefficents')
plt.ylabel('shrunk coefficents')
plt.legend()







