##################################################
## imports

## basic imports
import numpy as np
import pandas as pd
import math
import scipy as sp
import matplotlib.pyplot as plt
import seaborn; seaborn.set()

##keras
from keras import models
from keras import layers
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras import regularizers

## lift curve
def mylift(y, p):
   """Cumulative lift: fraction of positives captured when observations
   are taken in order of decreasing score.

   y : 0/1 array of outcomes.
   p : array of predicted scores, same length as y.
   Returns the running share of all positives found after taking the
   top-scored 1, 2, ..., n observations.
   """
   rank_desc = np.argsort(p)[::-1]          # indices, highest score first
   hits = np.cumsum(y[rank_desc])           # positives found so far
   return hits / np.sum(y)

##################################################
## data

## Each observation is a movie review
## y: binary, favorable or not
## x: text of movie review as represented by binary indicators of whether
##      a given term is in the review or not

np.random.seed(0)

## load data: reviews arrive as sequences of integer word indices,
## restricted to the nf most frequent terms in the corpus
nf=1000 # number of features = number of terms counted in each review
(xtr,ytr), (xte,yte) = imdb.load_data(num_words=nf)

## create binary features: one 0/1 column per term (bag-of-words presence)
tkn = Tokenizer(num_words=nf)
Xtr = tkn.sequences_to_matrix(xtr,mode="binary")
Xte = tkn.sequences_to_matrix(xte,mode="binary")

## quick look at data
print("shape of x train: ",Xtr.shape)
print("shape of x test: ",Xte.shape)
print("shape of y train: ",ytr.shape)
print("shape of y test: ",yte.shape)
ntr = len(ytr)
nte = len(yte)

## class balance (bare expressions below display only in an interactive session)
#distribution of ytr
pd.Series(ytr).value_counts()/ntr
#distribution of yte
## BUG FIX: test counts were divided by ntr (train size); that only gave
## the right answer because the IMDB train and test sets happen to be the
## same size — divide by nte instead
pd.Series(yte).value_counts()/nte

#X is all 0's and 1's
Xtr[0:20,0:5]
pd.Series(Xtr[:,4]).value_counts()
pd.Series(Xtr.reshape((nf*ntr,))).value_counts()

## since all the x's are binary, we do not have to standardise !!!

##################################################
### fit neural net model

## make model in keras, two hidden layers, each with 16 units, relu activation.
## final output layer is sigmoid to get a logit like fit.
## L2 (ridge) penalties shrink the hidden-layer weights toward zero.
#nn
l2p=.1 #L2 penalty  # NOTE(review): 0.1 is a fairly strong L2 penalty — confirm intended
nmod = models.Sequential()
nmod.add(layers.Dense(units=16,activation="relu",kernel_regularizer=regularizers.l2(l2p),input_shape=(nf,)))
nmod.add(layers.Dense(units=16,activation="relu",kernel_regularizer=regularizers.l2(l2p)))
nmod.add(layers.Dense(units=1,activation="sigmoid"))

#compile
# binary_crossentropy is similar to deviance loss, will also keep track of acc=%correct
nmod.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["accuracy"])

#fit
# 50 epochs, minibatches of 100; the held-out test set is passed as
# validation_data so val_loss/val_accuracy are tracked each epoch
nhist = nmod.fit(Xtr,ytr,epochs=50,verbose=1,batch_size=100,validation_data=(Xte,yte))

##################################################
## training vs. validation loss by epoch: a widening gap signals overfitting
loss_tr = nhist.history["loss"]
loss_va = nhist.history["val_loss"]

epochs = np.arange(1, len(loss_va) + 1)
plt.plot(epochs, loss_tr, "r--")
plt.plot(epochs, loss_va, "b-")
plt.legend(["train loss","test loss"])
plt.xlabel("Epoch"); plt.ylabel("Loss")
plt.show()

## lift on the held-out test set:
## blue dots = cumulative share of favorable reviews captured when the
## sample is taken in order of decreasing predicted probability;
## red 45-degree line = what a random ordering would give
phatte = nmod.predict(Xte)[:, 0]
nte = len(yte)
pvec = np.arange(1, nte + 1) / nte
plt.scatter(pvec, mylift(yte, phatte), s=.5, c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec, pvec, c='red')
plt.show()

## lift on the training set: in-sample analogue of the test lift plot
## (expect it to look better than the test curve if the fit is overfit)
phattr = nmod.predict(Xtr)[:, 0]
ntr = len(ytr)
pvec = np.arange(1, ntr + 1) / ntr
plt.scatter(pvec, mylift(ytr, phattr), s=.5, c='blue')
plt.xlabel('percent sample'); plt.ylabel('percent y=1')
plt.plot(pvec, pvec, c='red')
plt.show()






