
##################################################
### basic imports 
import matplotlib.pyplot as plt
import numpy as np
import scipy
import pandas as pd
import math
import seaborn as sns; sns.set()

### sklearn model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

##roc/auc
from sklearn.metrics import roc_auc_score
## note online sklearn seemed to have plot_roc_curve, but I just found this one
##   I probably need to update my system!!
from sklearn.metrics import roc_curve

## fit simple logit
import statsmodels.api as sm

## lift curve
def mylift(y, p):
    """Cumulative lift curve for a binary outcome.

    Sorts the observations by predicted probability `p` (descending) and
    returns, at each position k, the fraction of all y=1 cases captured
    in the top-k predictions.

    Parameters
    ----------
    y : array-like of 0/1 outcomes.
    p : array-like of predicted scores/probabilities, same length as y.

    Returns
    -------
    numpy array of cumulative capture fractions, ending at 1.0.
    """
    # np.asarray lets callers pass lists or pandas Series, not just ndarrays
    y = np.asarray(y)
    p = np.asarray(p)
    ii = np.argsort(p)[::-1]  # indices ordered by descending score
    return np.cumsum(y[ii]) / np.sum(y)

##################################################
### data
## train: column 0 is the 0/1 response, the rest are features
trainDf = pd.read_csv("http://www.rob-mcculloch.org/data/td1.csv")
ytr = trainDf.iloc[:, 0].to_numpy()
xtr = trainDf.iloc[:, 1:]
ntr = ytr.shape[0]
## test: same layout as the training file
testDf = pd.read_csv("http://www.rob-mcculloch.org/data/td2.csv")
yte = testDf.iloc[:, 0].to_numpy()
xte = testDf.iloc[:, 1:]
nte = yte.shape[0]

##################################################
### logit
## statsmodels Logit has no intercept by default, so add a constant column
XX = sm.add_constant(xtr)
lfitM = sm.Logit(ytr, XX).fit()
XXp = sm.add_constant(xte)
phlgt = lfitM.predict(XXp)  # predicted P(y=1) on the test set

## out of sample lift
pvec = np.linspace(1,nte,nte)/nte  # top-k fractions 1/nte, 2/nte, ..., 1
plt.scatter(pvec,mylift(yte,phlgt),s=.5,c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')  # 45-degree line: expected lift of a random ranking
plt.show()

##auc
auclgt = roc_auc_score(yte,phlgt)
print('auc for logit: ',auclgt)

##roc
roclgt = roc_curve(yte,phlgt)  # (fpr, tpr, thresholds)
plt.plot(roclgt[0],roclgt[1])
## fixed axis-label typo: "postive" -> "positive"
plt.xlabel('false positive rate'); plt.ylabel('true positive rate')
plt.title('logit auc ' + str(np.round(auclgt,2)))
plt.show()


##################################################
### decision tree
## cap leaves at 50 to limit overfitting on this small problem
dtm = DecisionTreeClassifier(max_leaf_nodes=50)
dtm.fit(xtr,ytr)
phdt = dtm.predict_proba(xte)[:,1]  # column 1 = P(y=1)

## out of sample lift (tree in green, logit in blue for comparison)
plt.scatter(pvec,mylift(yte,phdt),s=.5,c='green')
plt.scatter(pvec,mylift(yte,phlgt),s=.5,c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')  # 45-degree line: random-ranking baseline
plt.show()

##auc
aucdt = roc_auc_score(yte,phdt)
print('auc for tree: ',aucdt)

##roc
rocdt = roc_curve(yte,phdt)  # (fpr, tpr, thresholds)
plt.plot(rocdt[0],rocdt[1])
## fixed axis-label typo: "postive" -> "positive"
plt.xlabel('false positive rate'); plt.ylabel('true positive rate')
plt.title('tree auc ' + str(np.round(aucdt,2)))
plt.show()

##################################################
### random forests
rfm = RandomForestClassifier(random_state=0,n_jobs=-1,n_estimators=500,max_features=2,min_samples_split=20,oob_score=True)
rfm.fit(xtr,ytr)
phrf = rfm.predict_proba(xte)[:,1]  # column 1 = P(y=1)

# the OOB score is computed as the number of correctly predicted rows from the out of bag sample.
# not too useful in this application
print("oob score for Random Forests: ",rfm.oob_score_)

## out of sample lift (rf in magenta vs tree/logit)
plt.scatter(pvec,mylift(yte,phdt),s=.5,c='green')
plt.scatter(pvec,mylift(yte,phlgt),s=.5,c='blue')
plt.scatter(pvec,mylift(yte,phrf),s=.5,c='magenta')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')  # 45-degree line: random-ranking baseline
plt.show()

##auc
aucrf = roc_auc_score(yte,phrf)
print('auc for random forests: ',aucrf)

##roc
rocrf = roc_curve(yte,phrf)  # (fpr, tpr, thresholds)
plt.plot(rocrf[0],rocrf[1])
## fixed axis-label typo: "postive" -> "positive"
plt.xlabel('false positive rate'); plt.ylabel('true positive rate')
plt.title('random forests auc ' + str(np.round(aucrf,2)))
plt.show()

##################################################
### gradient boosting

## let's try two different settings for boosting
# first setting: slow learning rate, many trees
gbm = GradientBoostingClassifier(learning_rate=.01,n_estimators=1000,max_depth=4)
gbm.fit(xtr,ytr)
phgb = gbm.predict_proba(xte)[:,1]  # column 1 = P(y=1)

# second setting: 10x faster learning rate
gbm1 = GradientBoostingClassifier(learning_rate=.1,n_estimators=1000,max_depth=4)
gbm1.fit(xtr,ytr)
phgb1 = gbm1.predict_proba(xte)[:,1]

## out of sample lift (both boosting fits vs the earlier models)
plt.scatter(pvec,mylift(yte,phdt),s=.5,c='green')
plt.scatter(pvec,mylift(yte,phlgt),s=.5,c='blue')
plt.scatter(pvec,mylift(yte,phrf),s=.5,c='magenta')
plt.scatter(pvec,mylift(yte,phgb),s=.5,c='black')
plt.scatter(pvec,mylift(yte,phgb1),s=.5,c='grey')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec,pvec,c='red')  # 45-degree line: random-ranking baseline
plt.show()

##auc
aucgb = roc_auc_score(yte,phgb)
## fixed copy-paste bug: label said 'auc for tree' but this is boosting
print('auc for boosting: ',aucgb)
## the second fit was plotted but never scored -- report its AUC too
aucgb1 = roc_auc_score(yte,phgb1)
print('auc for boosting (learning_rate=.1): ',aucgb1)

##roc
rocgb = roc_curve(yte,phgb)  # (fpr, tpr, thresholds)
plt.plot(rocgb[0],rocgb[1])
## fixed axis-label typo: "postive" -> "positive"
plt.xlabel('false positive rate'); plt.ylabel('true positive rate')
plt.title('boosting auc ' + str(np.round(aucgb,2)))
plt.show()

##################################################
### auc
## gather the out-of-sample AUCs computed above into one vector for comparison
aucv = np.asarray([auclgt, aucdt, aucrf, aucgb])
print('auc: (logit, tree, rf, gb)', np.round(aucv, 2))

