Sentiment Analysis with Deep Learning#
Loading Packages#
%%time
import gensim
import keras
from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder
from keras.layers import BatchNormalization
Preparing Data#
## Data Import and Preprocessing
import pandas as pd
import numpy as np
#import text_normalizer as tn
#import model_evaluation_utils as meu
import nltk
np.set_printoptions(precision=2, linewidth=80)
dataset = pd.read_csv('../data/movie_reviews.csv')
# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])
type(reviews)
reviews.shape
sentiments.shape
# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]
## Text normalization is skipped here; the raw reviews are used as-is
norm_train_reviews = train_reviews
norm_test_reviews = test_reviews
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
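Text normalization is skipped above and the raw reviews are used directly. For reference, a minimal cleanup pass might look like the sketch below; `simple_normalize` is a hypothetical helper (not used in the rest of this section) that strips the `<br />` tags visible in the preview and lowercases the text.
## Hypothetical normalization sketch (not applied below): strip the HTML
## line breaks seen in the preview and lowercase the text
import re

def simple_normalize(text):
    text = re.sub(r'<br\s*/?>', ' ', text)    # drop <br /> tags
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace
    return text.lower()

# norm_train_reviews = [simple_normalize(r) for r in train_reviews]
# norm_test_reviews = [simple_normalize(r) for r in test_reviews]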
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
le = LabelEncoder()
num_classes=2
# tokenize train reviews & encode train labels
tokenized_train = [tokenizer.tokenize(text)
                   for text in norm_train_reviews]
y_tr = le.fit_transform(train_sentiments)
y_train = keras.utils.to_categorical(y_tr, num_classes)
# tokenize test reviews & encode test labels
tokenized_test = [tokenizer.tokenize(text)
                  for text in norm_test_reviews]
# reuse the label mapping fitted on the training labels (transform, not fit_transform)
y_ts = le.transform(test_sentiments)
y_test = keras.utils.to_categorical(y_ts, num_classes)
# print class label encoding map and encoded labels
print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))
print('Sample test label transformation:\n'+'-'*35,
      '\nActual Labels:', test_sentiments[:3], '\nEncoded Labels:', y_ts[:3],
      '\nOne hot encoded Labels:\n', y_test[:3])
Sentiment class label map: {'negative': 0, 'positive': 1}
Sample test label transformation:
-----------------------------------
Actual Labels: ['negative' 'positive' 'negative']
Encoded Labels: [0 1 0]
One hot encoded Labels:
[[1. 0.]
 [0. 1.]
 [1. 0.]]
Training Word Embeddings#
%%time
# build word2vec model (gensim 3.x API; in gensim 4+ the `size` argument is `vector_size`)
w2v_num_features = 512
w2v_model = gensim.models.Word2Vec(tokenized_train,
                                   size=w2v_num_features, window=150,
                                   min_count=10, sample=1e-3, workers=16)
## takes 5mins
CPU times: user 18min 20s, sys: 5.03 s, total: 18min 25s
Wall time: 5min 12s
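Before averaging the word vectors into document features, a quick sanity check on the learned embeddings can be useful; the query word 'excellent' below is just an illustrative choice.
# sanity check: nearest neighbours of an illustrative query word
w2v_model.wv.most_similar('excellent', topn=5)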
## This model uses the document word vector averaging scheme
## Use the average word vector representation to represent one document (movie review)
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
# generate averaged word vector features from word2vec model
avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=w2v_num_features)
avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=w2v_num_features)
Loading Pre-trained Word Embeddings#
# %%time
# # Use the 300-dimensional word vectors trained on the Common Crawl using the GloVe model
# # Provided by spaCy
# import spacy
# #nlp = spacy.load('en', parse=False, tag=False, entity=False)
# nlp_vec = spacy.load('en_vectors_web_lg', parse=False, tag=False, entity=False)
# ## feature engineering with GloVe model
# train_nlp = [nlp_vec(item) for item in norm_train_reviews]
# train_glove_features = np.array([item.vector for item in train_nlp])
# test_nlp = [nlp_vec(item) for item in norm_test_reviews]
# test_glove_features = np.array([item.vector for item in test_nlp])
# print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)
# print('GloVe model:> Train features shape:', train_glove_features.shape, ' Test features shape:', test_glove_features.shape)
Building Model#
A simple fully-connected 4-layer deep neural network (also known as a multi-layer perceptron, or deep ANN):
- an input layer (not counted as a layer), which here takes the averaged word-embedding features
- three dense hidden layers (512 neurons each)
- one output layer (2 neurons, for the two sentiment classes)
def construct_deepnn_architecture(num_input_features):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, input_shape=(num_input_features,), kernel_initializer='glorot_uniform'))
    dnn_model.add(BatchNormalization()) # improves stability of the network
    dnn_model.add(Activation('relu'))   # relu rather than sigmoid, to prevent the vanishing gradient problem
    dnn_model.add(Dropout(0.2))         # reduces overfitting
    dnn_model.add(Dense(512, kernel_initializer='glorot_uniform'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(512, kernel_initializer='glorot_uniform'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(2))
    dnn_model.add(Activation('softmax'))
    dnn_model.compile(loss='categorical_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
    return dnn_model
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)
Model Visualization#
To make this work, install pydot
pip3 install pydot
and also install Graphviz; on macOS, run in the terminal:
brew install graphviz
## Not working yet. Had a problem with the installation of graphviz on mac
# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot
# SVG(model_to_dot(w2v_dnn, show_shapes=True, show_layer_names=False,
# rankdir='TB').create(prog='dot', format='svg'))
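If Graphviz remains a problem, a plain-text view of the architecture is available without any extra dependencies:
# text-only summary of layers, output shapes, and parameter counts
w2v_dnn.summary()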
Model Fitting#
Fitting using self-trained word embeddings#
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, y_train, epochs=10, batch_size=batch_size,
            shuffle=True, validation_split=0.1, verbose=1)
Epoch 1/10
315/315 [==============================] - 3s 10ms/step - loss: 0.3763 - accuracy: 0.8397 - val_loss: 0.3127 - val_accuracy: 0.8640
Epoch 2/10
315/315 [==============================] - 3s 9ms/step - loss: 0.3054 - accuracy: 0.8719 - val_loss: 0.3108 - val_accuracy: 0.8757
Epoch 3/10
315/315 [==============================] - 3s 9ms/step - loss: 0.2948 - accuracy: 0.8776 - val_loss: 0.3133 - val_accuracy: 0.8651
Epoch 4/10
315/315 [==============================] - 3s 9ms/step - loss: 0.2837 - accuracy: 0.8811 - val_loss: 0.3099 - val_accuracy: 0.8706
Epoch 5/10
315/315 [==============================] - 3s 10ms/step - loss: 0.2747 - accuracy: 0.8857 - val_loss: 0.3048 - val_accuracy: 0.8763
Epoch 6/10
315/315 [==============================] - 3s 10ms/step - loss: 0.2712 - accuracy: 0.8858 - val_loss: 0.3337 - val_accuracy: 0.8606
Epoch 7/10
315/315 [==============================] - 3s 10ms/step - loss: 0.2624 - accuracy: 0.8906 - val_loss: 0.3099 - val_accuracy: 0.8737
Epoch 8/10
315/315 [==============================] - 3s 10ms/step - loss: 0.2583 - accuracy: 0.8914 - val_loss: 0.3321 - val_accuracy: 0.8700
Epoch 9/10
315/315 [==============================] - 3s 10ms/step - loss: 0.2495 - accuracy: 0.8949 - val_loss: 0.3111 - val_accuracy: 0.8711
Epoch 10/10
315/315 [==============================] - 3s 9ms/step - loss: 0.2397 - accuracy: 0.9003 - val_loss: 0.3370 - val_accuracy: 0.8734
<tensorflow.python.keras.callbacks.History at 0x7fae04772a90>
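Before generating per-class predictions, the overall test loss and accuracy can be read directly from Keras (a quick check; the fuller report follows below):
# quick test-set check with Keras' built-in evaluation (returns loss and accuracy)
test_loss, test_acc = w2v_dnn.evaluate(avg_wv_test_features, y_test, verbose=0)
print('Test accuracy: {:.4f}'.format(test_acc))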
y_pred = w2v_dnn.predict_classes(avg_wv_test_features)
predictions = le.inverse_transform(y_pred)
WARNING:tensorflow:From <ipython-input-13-bf19a67cc778>:1: Sequential.predict_classes (from tensorflow.python.keras.engine.sequential) is deprecated and will be removed after 2021-01-01.
Instructions for updating:
Please use instead:
* `np.argmax(model.predict(x), axis=-1)`, if your model does multi-class classification (e.g. if it uses a `softmax` last-layer activation).
* `(model.predict(x) > 0.5).astype("int32")`, if your model does binary classification (e.g. if it uses a `sigmoid` last-layer activation).
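Following the deprecation warning, the same class indices can be obtained with an explicit argmax over the softmax outputs; this is a drop-in replacement for the cell above:
# non-deprecated equivalent of predict_classes for a softmax output layer
y_pred = np.argmax(w2v_dnn.predict(avg_wv_test_features), axis=-1)
predictions = le.inverse_transform(y_pred)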
# functions from Text Analytics with Python book
def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', np.round(
        metrics.accuracy_score(true_labels,
                               predicted_labels),
        4))
    print('Precision:', np.round(
        metrics.precision_score(true_labels,
                                predicted_labels,
                                average='weighted'),
        4))
    print('Recall:', np.round(
        metrics.recall_score(true_labels,
                             predicted_labels,
                             average='weighted'),
        4))
    print('F1 Score:', np.round(
        metrics.f1_score(true_labels,
                         predicted_labels,
                         average='weighted'),
        4))

def display_confusion_matrix(true_labels, predicted_labels, classes=[1, 0]):
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes],
                                                  codes=level_labels),
                            index=pd.MultiIndex(levels=[['Actual:'], classes],
                                                codes=level_labels))
    print(cm_frame)

def display_classification_report(true_labels, predicted_labels, classes=[1, 0]):
    report = metrics.classification_report(y_true=true_labels,
                                           y_pred=predicted_labels,
                                           labels=classes)
    print(report)

def display_model_performance_metrics(true_labels, predicted_labels, classes=[1, 0]):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels,
                             classes=classes)
from sklearn import metrics
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions,
                                  classes=['positive', 'negative'])
Model Performance metrics:
------------------------------
Accuracy: 0.8717
Precision: 0.8719
Recall: 0.8717
F1 Score: 0.8717
Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.86      0.88      0.87      7510
    negative       0.88      0.86      0.87      7490

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000

Prediction Confusion Matrix:
------------------------------
                 Predicted:
                   positive negative
Actual: positive       6628      882
        negative       1042     6448
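As a closing check, a new, unseen review can be scored with the same pipeline; the example sentence is made up, and everything else reuses the objects defined above (tokenizer, w2v_model, averaged_word2vec_vectorizer, w2v_dnn, le).
# score a new review with the trained pipeline (illustrative example sentence)
new_review = "The movie was surprisingly good, with a strong cast and a clever plot."
new_tokens = [tokenizer.tokenize(new_review)]
new_features = averaged_word2vec_vectorizer(corpus=new_tokens, model=w2v_model,
                                             num_features=w2v_num_features)
new_pred = np.argmax(w2v_dnn.predict(new_features), axis=-1)
print(le.inverse_transform(new_pred))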
Fitting using pre-trained word embeddings#
# glove_dnn = construct_deepnn_architecture(num_input_features=300)
# batch_size = 100
# glove_dnn.fit(train_glove_features, y_train, epochs=10, batch_size=batch_size,
# shuffle=True, validation_split=0.1, verbose=1)
# y_pred = glove_dnn.predict_classes(test_glove_features)
# predictions = le.inverse_transform(y_pred)
# meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions,
# classes=['positive', 'negative'])