##################################################
##################################################
### example from section 3.4 of "Deep Learning with Python" by Chollet.
### section 4.1 in second edition

### classify movie reviews as good or bad based on the text

##################################################
## imports

##basic
import numpy as np
import pandas as pd
import math

## graphics
import matplotlib.pyplot as plt
plt.style.use('seaborn')
%matplotlib

## sklearn feature scaling
from sklearn.preprocessing import StandardScaler

## keras
from tensorflow import keras

## metrics from sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

##roc/auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

## lift curve
def mylift(y, p):
   """Cumulative lift curve.

   y: binary 0/1 outcomes (numpy array)
   p: predicted probability of a 1

   Returns, for each prefix of the data ranked by descending p,
   the fraction of all 1's captured so far.
   """
   desc = np.argsort(p)[::-1]     # ranks: highest predicted probability first
   hits = np.cumsum(y[desc])      # running count of 1's captured
   return hits / np.sum(y)        # as a fraction of all the 1's


##################################################
### load data pg 68, page 97 second edition

## num_words=10000 keeps only the 10,000 most frequently occurring words
## in the corpus; rarer words are dropped from the reviews
(train_data, train_labels), (test_data, test_labels) = keras.datasets.imdb.load_data(num_words=10000)
print(f'number of train observations from labels is {len(train_labels)}')
print(f'number of test observations from labels is {len(test_labels)}')
print(f'number of train observations from features is {len(train_data)}')
print(f'number of test observations from features is {len(test_data)}')

## data is an ndarray of lists; each list has the integer ids of the words
## in that document (reviews have different lengths, so no rectangular array)
print(type(train_data)) #ndarray
print(type(train_data[0])) #each element of the array is a list
print(len(train_data[0])) #the list gives the word ids of each word in the document (bag of words)
print(len(train_data[1]))

## ids of first 5 words in first train obs
print(train_data[0][:5])

## check range of word ids in train features; should stay below num_words
MM = max([max(sequence) for sequence in train_data])
LL = min([min(sequence) for sequence in train_data])
print(f'train ids range from {LL} to {MM}')

## labels are 0,1 binary, 1 is positive
train_labels.dtype

## tables for binary y, train and test; this data is curated so that the 0's and 1's are balanced.
## 0 is for a bad review and 1 for a good review
print(pd.Series(train_labels).value_counts())
print(pd.Series(test_labels).value_counts())


##################################################
### pg 69 (98 second ed) , for fun get word names

## word_index is a dict with keys = text of words and values = integer indices
word_index = keras.datasets.imdb.get_word_index()
print(word_index['film'])

# flip (key, value) -> (value, key) so we can look a word up by its id
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

## first (most frequent) words
for i in range(1,21):
   print(i)
   print(reverse_word_index[i])

## randomly sampled words
nw = len(word_index)
np.random.seed(34) 
nd = 10
indices = np.random.choice(nw,nd,replace=False)
for i in indices:
   print((i,reverse_word_index[i]))

## decode a whole review back to text
## word ids are offset by 3 relative to word_index (low ids are reserved
## tokens), hence i-3; ids with no dictionary entry print as '?'
rid = 10 # review id to translate to words
decoded_review = ' '.join([reverse_word_index.get(i-3,'?') for i in train_data[rid]])
print(decoded_review)

## just print out the first nw words from the review
rid = 10 # review id to translate to words
nw = 15
for i in range(nw): # was range(1,nw), which skipped word 0 and printed only nw-1 words
   wid = train_data[rid][i]
   # use .get as in the decoder above: plain indexing raises KeyError for
   # reserved ids (wid < 3 makes wid-3 negative or zero)
   print(wid,reverse_word_index.get(wid-3,'?')) #offset of 3

##################################################
### preparing the data

def vseq(seq, dim=10000):
   """Multi-hot encode: return a (len(seq), dim) 0/1 array in which
   row i has a 1 in every column whose id appears in seq[i]."""
   out = np.zeros((len(seq), dim))
   for row, word_ids in enumerate(seq):
      out[row, word_ids] = 1   # fancy indexing sets all listed columns at once
   return out

xtr = vseq(train_data)
xte = vseq(test_data)

## check: the number of 1's in a multi-hot row should equal the number of
## distinct word ids in the original review
rid =  0
print(f'num = 1 in rev {rid} is {xtr[rid].sum()}')
print(f'number unique id in rev is {len(pd.Series(train_data[rid]).unique())}')

## labels as float32 arrays
## (2ed pg 99) "you should also vectorize your labels, which is straightforward"
ytr = np.asarray(train_labels).astype('float32')
yte = np.asarray(test_labels).astype('float32')
print(pd.Series(ytr).value_counts())
print(pd.Series(yte).value_counts())

##################################################
## pg 70 Building your network (2nd ed pg 99)

# architecture choices:
#   number of hidden layers: 2
#   number of units in each hidden layer: (16,16)
# single sigmoid output unit -> probability the review is positive

model = keras.models.Sequential()

model.add(keras.layers.Dense(16,activation='relu',input_shape=(xtr.shape[1],)))
model.add(keras.layers.Dense(16,activation='relu'))
model.add(keras.layers.Dense(1,activation='sigmoid'))

## (2nd ed pg 102) have to choose
## optimizer, loss, metrics to monitor as well as loss
## 2nd ed, pg 106, section 4.1.7
##   "The rmsprop is generally a good enough choice, whatever your problem.
##      That's one less thing for you to worry about."
##                 hmmmmmm.
## binary_crossentropy is the standard loss for a single sigmoid output
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

## split train into "partial train" and val:
## first nval observations become the validation set, the rest are trained on
nval = 10000

xval = xtr[:nval]
yval = ytr[:nval]

pxtr = xtr[nval:]
pytr = ytr[nval:]

# train and store history in trh
## have to choose number of epochs and batch size
trh = model.fit(pxtr,pytr,epochs=20,batch_size=512,validation_data=(xval,yval))

##################################################
### plot history pg 74

hdict = trh.history # dict of per-epoch metric lists recorded by fit()
type(hdict)
hdict.keys()

## loss
trL = hdict['loss']
teL = hdict['val_loss']
eind = range(1,len(trL)+1) # epoch numbers for the x axis

plt.figure() # start a fresh figure so nothing drawn earlier leaks into this plot
plt.plot(eind,trL,'bo',label='Training Loss')
plt.plot(eind,teL,'r',label='Validation Loss')
plt.title('loss vs epoch')
plt.xlabel('epoch'); plt.ylabel('loss')
plt.legend(loc=9)
fig = plt.gcf()
fig.savefig('training_imdb.pdf')

## accuracy
trA = hdict['accuracy']
teA = hdict['val_accuracy']

## bug fix: without a new figure the accuracy curves were drawn on top of the
## loss curves, so the saved accuracy pdf also contained the loss plot
plt.figure()
plt.plot(eind,trA,'bo',label='Training Accuracy')
plt.plot(eind,teA,'r',label='Validation Accuracy')
plt.title('accuracy vs epoch')
plt.xlabel('epoch'); plt.ylabel('accuracy')
plt.legend()
fig = plt.gcf()
fig.savefig('training_imdb_acc.pdf')

##################################################
## refit on train using just 4 epochs, predict on test
## (removed a dead `mod = keras.models.Sequential()` that was immediately
##  overwritten by the list-style constructor below)

## first edition style:
## mod.add(keras.layers.Dense(16,activation='relu',input_shape=(xtr.shape[1],)))
## mod.add(keras.layers.Dense(16,activation='relu'))
## mod.add(keras.layers.Dense(1,activation='sigmoid'))

## second edition style: same architecture, built from a list of layers
mod = keras.Sequential([
   keras.layers.Dense(16,activation='relu'), 
   keras.layers.Dense(16,activation='relu'), 
   keras.layers.Dense(1,activation='sigmoid')
])

mod.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy']) 


# train on all the train data; stopping at 4 epochs (chosen from the
# validation curves above) acts as early-stopping regularization
nepoch = 4
mod.fit(xtr,ytr,epochs=nepoch,batch_size=512)
# now evaluate on test data
res = mod.evaluate(xte,yte)
print(res)

##################################################
### predict

phat = mod.predict(xte)
print(type(phat))
print(phat.shape)
phat = phat.squeeze() # (n,1) -> (n,) so it lines up with the label vector
print(phat.shape)

## threshold the probabilities at .5 to get hard 0/1 class calls
nte = len(yte)
ypred = np.where(phat > .5, 1.0, 0.0)
cTab = pd.crosstab(yte,ypred)

## check test accuracy: correct calls sit on the diagonal of the confusion table
print(np.diag(cTab.to_numpy()).sum()/nte)
print(cTab)


## plot lift curve
plt.figure() # fresh figure so the training-history plots don't leak in
plt.plot((np.arange(nte)+1)/nte,mylift(yte,phat))
plt.title('out-of-sample lift curve')
## dotted 45-degree line = expected lift from a random ordering
plt.plot((np.arange(nte)+1)/nte,(np.arange(nte)+1)/nte,c='red',linestyle='dotted')
plt.xlabel('percent test data'); plt.ylabel('percent test 1 found')
fig = plt.gcf()
## NOTE(review): 'imbd' in the filename looks like a typo for 'imdb';
## kept as-is so existing downstream references to the file still work
fig.savefig("test-lift_imbd.pdf")

##auc
aucnn = roc_auc_score(yte,phat)
print('auc for neural net: ',aucnn)

##roc
## bug fix: without a new figure the ROC curve was drawn on top of the
## lift curve, so the saved ROC pdf also contained the lift plot
plt.figure()
rocnn = roc_curve(yte,phat) # (fpr, tpr, thresholds)
plt.plot(rocnn[0],rocnn[1])
plt.xlabel('false positive rate'); plt.ylabel('true positive rate')
plt.title('neural net auc ' + str(np.round(aucnn,2)))
fig = plt.gcf()
fig.savefig("test-roc_imbd.pdf")


