Sequence Model with Attention for Addition Learning#

  • In this unit, we will practice on the sequence-to-sequence model using a naive example of numbers addition.

  • The inputs are sequences of two numbers adding together (e.g., 123+23); the outputs are the correct answers, i.e., the sum of the two numbers (e.g., 146).

  • This type of sequence model is also referred to as Encoder-Decoder Models.

  • This task is to simulate the machine translation task (i.e., the sequence to the left of the equation is the source language while the sequence to the right of the equation is the target language).

  • In particular, we will implement not only a vanilla RNN-based sequence-to-sequence model but also a few extended variants of the RNN, including:

    • GRU

    • Bidirectional GRU

    • Peeky Decoder

    • Attention-based Decoder

Set up Dependencies#

import re
import tensorflow
import numpy as np
from random import randint

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import Attention
from tensorflow.keras.layers import AdditiveAttention
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
print('Tensorflow Version: ', tensorflow.__version__)
Tensorflow Version:  2.4.1

Deep Learning Hyperparameters#

# Training hyperparameters shared by all of the seq2seq models below.
batch_size = 128  # Number of samples per gradient update.
epochs = 30  # Number of full passes over the training set.
latent_dim = 256  # Dimensionality of the encoder/decoder hidden states.

Data#

  • Please download the data set from demo_data/addition-student-version.csv, where each line is a training sample, consisting of the input sequence (e.g., 16+75) and the target sequence (e.g., 91) separated by a comma.

  • We load the data and add initial and ending token to all the target sequences (_).

# CSV of addition problems: one "<expression>,<answer>" pair per line.
data_path = '../../../RepositoryData/data/deep-learning-2/addition-student-version.csv'

with open(data_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Split into lines and drop empties (e.g., the trailing newline at EOF).
lines = [line for line in raw_text.split('\n') if line != '']

input_texts = []
target_texts = []
for line in lines:
    fields = line.split(',')
    input_texts.append(fields[0])
    # Wrap each answer with '_' start/end markers for the decoder.
    target_texts.append('_' + fields[-1].strip() + '_')

# Fixed seed so the shuffled train/test split below is reproducible.
np.random.seed(123)

inds = np.arange(len(input_texts))
np.random.shuffle(inds)

print(input_texts[:5])
print(target_texts[:5])
print('Data Size:', len(input_texts))
['16+75', '52+607', '75+22', '63+22', '795+3']
['_91_', '_659_', '_97_', '_85_', '_798_']
Data Size: 50000

Train-Test Split#

# Hold out 10% of the (shuffled) data for testing.
train_test_ratio = 0.9
train_size = int(round(len(lines) * train_test_ratio))
train_inds, test_inds = inds[:train_size], inds[train_size:]


def _take(texts, indices):
    # Select the texts sitting at the given shuffled positions.
    return [texts[i] for i in indices]


tr_input_texts = _take(input_texts, train_inds)
tr_target_texts = _take(target_texts, train_inds)
ts_input_texts = _take(input_texts, test_inds)
ts_target_texts = _take(target_texts, test_inds)
tr_input_texts[:10]
['27+673',
 '153+27',
 '93+901',
 '243+678',
 '269+46',
 '235+891',
 '46+290',
 '324+947',
 '721+49',
 '535+7']
tr_target_texts[:10]
['_700_',
 '_180_',
 '_994_',
 '_921_',
 '_315_',
 '_1126_',
 '_336_',
 '_1271_',
 '_770_',
 '_542_']
print('Number of Samples:', len(lines))
print('Number of Samples in Training:', len(tr_input_texts))
print('Number of Samples in Testing:', len(ts_input_texts))
Number of Samples: 50000
Number of Samples in Training: 45000
Number of Samples in Testing: 5000

Data Preprocessing#

Text to Sequences#

  • Tokenization of input and target texts involves the following important steps:

    • Create a Tokenizer

    • Fit the Tokenizer on the training sets

    • Tokenize input and target texts of the training set into sequences

    • Identify the maxlen of the input and target sequences

    • Pad input and target sequences to uniform lengths

  • Note that we need to create a character-based Tokenizer.

  • There will be two Tokenizers, one for input texts and the other for target texts.

# """ Defining tokenizers """
def _fit_tokenizer_and_pad(texts):
    """Fit a character-level Tokenizer on `texts` and return
    (tokenizer, post-padded integer sequences, max sequence length).

    Factored out because the input-side and target-side pipelines
    were previously duplicated verbatim.
    """
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    maxlen = np.max([len(seq) for seq in sequences])
    padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
    return tokenizer, padded, maxlen


# One tokenizer for the source (input) side, one for the target side.
input_tokenizer, encoder_input_sequences, input_maxlen = \
    _fit_tokenizer_and_pad(tr_input_texts)
target_tokenizer, target_sequences, target_maxlen = \
    _fit_tokenizer_and_pad(tr_target_texts)

# Shapes of Input and Target Sequences
print(encoder_input_sequences.shape)
print(target_sequences.shape)
(45000, 7)
(45000, 6)
# ### vocab size
# Keras Tokenizer indices start at 1; the +1 reserves index 0 for padding.
input_vsize = max(input_tokenizer.index_word.keys()) + 1
target_vsize = max(target_tokenizer.index_word.keys()) + 1

Note

The plus 1 for vocabulary size is to include the padding character, whose index is the reserved 0.

print(input_vsize)
print(target_vsize)
12
12
print(tr_input_texts[:3])
print(encoder_input_sequences[:3])
['27+673', '153+27', '93+901']
[[ 9 10  1  7 10  5  0]
 [ 8  6  5  1  9 10  0]
 [ 2  5  1  2 11  8  0]]
input_tokenizer.word_index
{'+': 1,
 '9': 2,
 '4': 3,
 '8': 4,
 '3': 5,
 '5': 6,
 '6': 7,
 '1': 8,
 '2': 9,
 '7': 10,
 '0': 11}
print(tr_target_texts[:3])
print(target_sequences[:3])
['_700_', '_180_', '_994_']
[[ 1  4 11 11  1  0]
 [ 1  2 10 11  1  0]
 [ 1  7  7  8  1  0]]
target_tokenizer.word_index
{'_': 1,
 '1': 2,
 '2': 3,
 '7': 4,
 '6': 5,
 '3': 6,
 '9': 7,
 '4': 8,
 '5': 9,
 '8': 10,
 '0': 11}

Special Considerations for Decoder’s Input and Output#

  • In the training stage, we give the Decoder the correct target sequences for teacher forcing.

  • Input and Output Sequences for Decoder

    • Decoder input and output sequences have one time-step difference (i.e., the decoder’s output at \(t-1\) is the decoder’s input at \(t\))

    • We create decoder input and output sequences as different sets of data.

# Teacher forcing: the decoder's input is the target sequence without its
# final token, and its expected output is the same sequence shifted left
# by one step (so output at t corresponds to input at t+1).
decoder_input_sequences, decoder_output_sequences = (
    target_sequences[:, :-1],
    target_sequences[:, 1:],
)
print(decoder_input_sequences[:5])
print(decoder_output_sequences[:5])
[[ 1  4 11 11  1]
 [ 1  2 10 11  1]
 [ 1  7  7  8  1]
 [ 1  7  3  2  1]
 [ 1  6  2  9  1]]
[[ 4 11 11  1  0]
 [ 2 10 11  1  0]
 [ 7  7  8  1  0]
 [ 7  3  2  1  0]
 [ 6  2  9  1  0]]

Sequences to One-Hot Encoding#

  • To simplify the matter, we convert each sequence/integer token into one-hot encoding, which will be the input of the Encoder directly.

  • Normally we would add an Embedding layer to convert sequence tokens to embeddings before sending them to the Encoder.

  • Please note that this step renders the text representation of the entire training data from 2D (batch_size, max_length) to 3D tensors (batch_size, max_length, vocab_size).

Note

For neural machine translations, the vocabulary sizes of the input and target languages are usually very large. It is more effective to implement an Embedding layer to convert sequences (integers) into embeddings, rather than one-hot encodings.

For this tutorial, we have a limited vocabulary size (only digits and math symbols). One-hot encodings should be fine.

However, in the assignment, you will practice on how to add embedding layers for both Encoder and Decoder.

print(encoder_input_sequences.shape)
print(decoder_input_sequences.shape)
print(decoder_output_sequences.shape)
(45000, 7)
(45000, 5)
(45000, 5)
# Expand every integer sequence into one-hot vectors, turning the
# 2D (batch, maxlen) arrays into 3D (batch, maxlen, vocab_size) tensors.
encoder_input_onehot = to_categorical(encoder_input_sequences, num_classes=input_vsize)
decoder_input_onehot = to_categorical(decoder_input_sequences, num_classes=target_vsize)
decoder_output_onehot = to_categorical(decoder_output_sequences, num_classes=target_vsize)

for onehot in (encoder_input_onehot, decoder_input_onehot, decoder_output_onehot):
    print(onehot.shape)
(45000, 7, 12)
(45000, 5, 12)
(45000, 5, 12)

Token Indices#

  • We create the integer-to-character dictionaries for later use.

  • Two dictionaries, one for the input sequence and one for the target sequence.

""" Index2word """
# Keras Tokenizer already maintains the inverse mapping (index -> character)
# as `index_word` (used above when computing the vocab sizes), so there is
# no need to re-invert `word_index` by zipping its values and keys.
enc_index2word = dict(input_tokenizer.index_word)
dec_index2word = dict(target_tokenizer.index_word)
enc_index2word
{1: '+',
 2: '9',
 3: '4',
 4: '8',
 5: '3',
 6: '5',
 7: '6',
 8: '1',
 9: '2',
 10: '7',
 11: '0'}
dec_index2word
{1: '_',
 2: '1',
 3: '2',
 4: '7',
 5: '6',
 6: '3',
 7: '9',
 8: '4',
 9: '5',
 10: '8',
 11: '0'}
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
# Render higher-resolution inline figures.
matplotlib.rcParams['figure.dpi'] = 150


# Plotting results
def plot1(history):
    """Plot training vs. validation accuracy and loss curves.

    Expects a Keras ``History`` object produced by ``model.fit(...)``
    with ``validation_split``/``validation_data`` set, so that the
    ``val_accuracy``/``val_loss`` keys are present.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    # 1-based epoch axis. Named `epoch_range` (not `epochs`) so it does not
    # shadow the module-level `epochs` hyperparameter.
    epoch_range = range(1, len(acc) + 1)

    ## Accuracy plot
    plt.plot(epoch_range, acc, 'bo', label='Training acc')
    plt.plot(epoch_range, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    ## Loss plot (on a separate figure)
    plt.figure()
    plt.plot(epoch_range, loss, 'bo', label='Training loss')
    plt.plot(epoch_range, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()


def plot2(history):
    """Plot every metric recorded in a Keras History on one set of axes."""
    ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
    ax.grid(True)
    # ax.set_ylim(0, 1)  # optionally clamp the y-axis to [0, 1]
    plt.show()

Model Training#

  • Define the model architecture

  • Train the model

  • Sequence-to-Sequence can go from simple RNNs to complex models with attention mechanisms.

  • In this tutorial, we will try the following:

    • Sequence-to-sequence model with vanilla RNN Encoder and Decoder

    • Sequence-to-sequence model with GRU Encoder and Decoder

    • Sequence-to-sequence model with bidirectional RNN Encoder

    • Sequence-to-sequence model with peeky Decoder

    • Sequence-to-sequence model with attention-based Decoder

  • Sequential vs. Functional API in keras

    • We have been using the Sequential API to create the network models, where each layer’s output is the input of the subsequent layer.

    • However, for Encoder-Decoder Models, sometimes not all the outputs of the previous layer are the inputs of the subsequent layer.

    • We need more flexibility in the ways of connecting the inputs and outputs of the model layers.

    • Therefore, here we will use the Functional API for model definition.

Model 1 (Vanilla RNN)#

Define Model#

  • Important Highlights:

    • In the training stage, we feed the decoder the correct answer at each time step as the input sequence.

    • In the testing stage, the decoder will take its own predicted token from the previous time step as the input at the current time step.

    • This type of training is referred to as teacher forcing learning strategy. This can help the model converge more effectively.

    • The decoder uses encoder’s last hidden state as the initial hidden state.

# Define Model Inputs
encoder_inputs = Input(shape=(input_maxlen, input_vsize),
                       name='encoder_inputs')
decoder_inputs = Input(shape=(target_maxlen - 1, target_vsize),
                       name='decoder_inputs')

# Encoder RNN
    ## Only the encoder's final hidden state is used (as the decoder's
    ## initial state), so `return_state=True` alone suffices; the previous
    ## `return_sequences=True` computed all per-timestep outputs only to
    ## discard them.
encoder_rnn = SimpleRNN(latent_dim,
                        return_state=True,
                        name='encoder_rnn')
_, encoder_state = encoder_rnn(encoder_inputs)


# Decoder RNN
    ## `initial_state=encoder_state`: the decoder starts from the encoder's
    ## summary of the source sequence.
    ## `decoder_inputs` carries the ground-truth previous tokens
    ## (teacher forcing).
decoder_rnn = SimpleRNN(latent_dim,
                        return_sequences=True,
                        return_state=True,
                        name='decoder_rnn')
decoder_out, _ = decoder_rnn(decoder_inputs, initial_state=encoder_state)

# Dense softmax over the target vocabulary, applied at every decoding step.
dense = Dense(target_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_out)

# Full model
full_model1 = Model(inputs=[encoder_inputs, decoder_inputs],
                    outputs=decoder_pred)
full_model1.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
encoder_inputs (InputLayer)     [(None, 7, 12)]      0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 5, 12)]      0                                            
__________________________________________________________________________________________________
encoder_rnn (SimpleRNN)         [(None, 7, 256), (No 68864       encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_rnn (SimpleRNN)         [(None, 5, 256), (No 68864       decoder_inputs[0][0]             
                                                                 encoder_rnn[0][1]                
__________________________________________________________________________________________________
time_distributed_layer (TimeDis (None, 5, 12)        3084        decoder_rnn[0][0]                
==================================================================================================
Total params: 140,812
Trainable params: 140,812
Non-trainable params: 0
__________________________________________________________________________________________________
plot_model(full_model1, show_shapes=True)
../_images/82c2f9f865f79006f50dd160a57edcd6814dca022566c3700398f58e20174586.png

Training#

# Run training
## Categorical cross-entropy matches the one-hot target encoding;
## 20% of the training set is held out for validation each epoch.
full_model1.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
history1 = full_model1.fit([encoder_input_onehot, decoder_input_onehot],
                           decoder_output_onehot,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=0.2)
Hide code cell output
Epoch 1/30
282/282 [==============================] - 6s 18ms/step - loss: 1.4381 - accuracy: 0.4772 - val_loss: 1.2027 - val_accuracy: 0.5463
Epoch 2/30
282/282 [==============================] - 4s 14ms/step - loss: 1.1515 - accuracy: 0.5703 - val_loss: 1.0187 - val_accuracy: 0.6145
Epoch 3/30
282/282 [==============================] - 4s 13ms/step - loss: 0.9641 - accuracy: 0.6353 - val_loss: 0.8675 - val_accuracy: 0.6631
Epoch 4/30
282/282 [==============================] - 4s 13ms/step - loss: 0.7778 - accuracy: 0.7006 - val_loss: 0.6676 - val_accuracy: 0.7382
Epoch 5/30
282/282 [==============================] - 4s 13ms/step - loss: 0.6133 - accuracy: 0.7656 - val_loss: 0.5297 - val_accuracy: 0.7950
Epoch 6/30
282/282 [==============================] - 4s 13ms/step - loss: 0.4924 - accuracy: 0.8125 - val_loss: 0.4788 - val_accuracy: 0.8073
Epoch 7/30
282/282 [==============================] - 4s 13ms/step - loss: 0.4017 - accuracy: 0.8469 - val_loss: 0.3664 - val_accuracy: 0.8606
Epoch 8/30
282/282 [==============================] - 4s 13ms/step - loss: 0.3236 - accuracy: 0.8785 - val_loss: 0.3241 - val_accuracy: 0.8713
Epoch 9/30
282/282 [==============================] - 4s 13ms/step - loss: 0.2702 - accuracy: 0.9006 - val_loss: 0.2862 - val_accuracy: 0.8865
Epoch 10/30
282/282 [==============================] - 4s 13ms/step - loss: 0.2376 - accuracy: 0.9129 - val_loss: 0.2563 - val_accuracy: 0.8998
Epoch 11/30
282/282 [==============================] - 4s 13ms/step - loss: 0.2150 - accuracy: 0.9216 - val_loss: 0.2413 - val_accuracy: 0.9053
Epoch 12/30
282/282 [==============================] - 4s 14ms/step - loss: 0.1915 - accuracy: 0.9306 - val_loss: 0.2324 - val_accuracy: 0.9094
Epoch 13/30
282/282 [==============================] - 4s 14ms/step - loss: 0.1726 - accuracy: 0.9368 - val_loss: 0.2059 - val_accuracy: 0.9189
Epoch 14/30
282/282 [==============================] - 4s 13ms/step - loss: 0.1542 - accuracy: 0.9443 - val_loss: 0.1909 - val_accuracy: 0.9263
Epoch 15/30
282/282 [==============================] - 4s 13ms/step - loss: 0.1517 - accuracy: 0.9442 - val_loss: 0.1730 - val_accuracy: 0.9346
Epoch 16/30
282/282 [==============================] - 4s 14ms/step - loss: 0.1301 - accuracy: 0.9545 - val_loss: 0.1708 - val_accuracy: 0.9344
Epoch 17/30
282/282 [==============================] - 4s 14ms/step - loss: 0.1391 - accuracy: 0.9491 - val_loss: 0.1521 - val_accuracy: 0.9422
Epoch 18/30
282/282 [==============================] - 4s 14ms/step - loss: 0.1173 - accuracy: 0.9585 - val_loss: 0.1605 - val_accuracy: 0.9391
Epoch 19/30
282/282 [==============================] - 4s 13ms/step - loss: 0.1132 - accuracy: 0.9592 - val_loss: 0.1502 - val_accuracy: 0.9416
Epoch 20/30
282/282 [==============================] - 4s 13ms/step - loss: 0.0986 - accuracy: 0.9654 - val_loss: 0.1500 - val_accuracy: 0.9437
Epoch 21/30
282/282 [==============================] - 4s 14ms/step - loss: 0.1062 - accuracy: 0.9617 - val_loss: 0.2019 - val_accuracy: 0.9241
Epoch 22/30
282/282 [==============================] - 4s 13ms/step - loss: 0.1048 - accuracy: 0.9624 - val_loss: 0.1514 - val_accuracy: 0.9419
Epoch 23/30
282/282 [==============================] - 4s 14ms/step - loss: 0.0954 - accuracy: 0.9658 - val_loss: 0.1427 - val_accuracy: 0.9450
Epoch 24/30
282/282 [==============================] - 4s 14ms/step - loss: 0.0902 - accuracy: 0.9680 - val_loss: 0.1463 - val_accuracy: 0.9453
Epoch 25/30
282/282 [==============================] - 5s 16ms/step - loss: 0.1045 - accuracy: 0.9618 - val_loss: 0.1242 - val_accuracy: 0.9530
Epoch 26/30
282/282 [==============================] - 5s 17ms/step - loss: 0.0781 - accuracy: 0.9730 - val_loss: 0.1561 - val_accuracy: 0.9412
Epoch 27/30
282/282 [==============================] - 4s 14ms/step - loss: 0.0914 - accuracy: 0.9663 - val_loss: 0.1405 - val_accuracy: 0.9470
Epoch 28/30
282/282 [==============================] - 4s 14ms/step - loss: 0.0828 - accuracy: 0.9704 - val_loss: 0.1462 - val_accuracy: 0.9456
Epoch 29/30
282/282 [==============================] - 4s 14ms/step - loss: 0.0723 - accuracy: 0.9751 - val_loss: 0.1581 - val_accuracy: 0.9415
Epoch 30/30
282/282 [==============================] - 4s 14ms/step - loss: 0.0878 - accuracy: 0.9681 - val_loss: 0.1689 - val_accuracy: 0.9377

Model 2 (GRU)#

Define Model#

  • Important highlights:

    • In Model 2, we replace vanilla RNN with GRU, which deals with the issue of long-distance dependencies between sequences.

    • You can try LSTM as well.

# Define Model Inputs
encoder_inputs = Input(shape=(input_maxlen, input_vsize),
                       name='encoder_inputs')
decoder_inputs = Input(shape=(target_maxlen - 1, target_vsize),
                       name='decoder_inputs')

# Encoder GRU
    ## Only the encoder's final hidden state is consumed (as the decoder's
    ## initial state), so `return_state=True` alone suffices; the previous
    ## `return_sequences=True` computed all per-timestep outputs only to
    ## discard them.
encoder_gru = GRU(latent_dim,
                  return_state=True,
                  name='encoder_gru')
_, encoder_state = encoder_gru(encoder_inputs)

# Decoder GRU
    ## using `encoder_state` (last h) as initial state.
    ## using `decoder_inputs` for teacher forcing learning.
decoder_gru = GRU(latent_dim,
                  return_sequences=True,
                  return_state=True,
                  name='decoder_gru')
decoder_out, _ = decoder_gru(decoder_inputs, initial_state=encoder_state)

# Dense softmax over the target vocabulary, applied at every decoding step.
dense = Dense(target_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_out)

# Full model
full_model2 = Model(inputs=[encoder_inputs, decoder_inputs],
                    outputs=decoder_pred)
full_model2.summary()
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
encoder_inputs (InputLayer)     [(None, 7, 12)]      0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 5, 12)]      0                                            
__________________________________________________________________________________________________
encoder_gru (GRU)               [(None, 7, 256), (No 207360      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_gru (GRU)               [(None, 5, 256), (No 207360      decoder_inputs[0][0]             
                                                                 encoder_gru[0][1]                
__________________________________________________________________________________________________
time_distributed_layer (TimeDis (None, 5, 12)        3084        decoder_gru[0][0]                
==================================================================================================
Total params: 417,804
Trainable params: 417,804
Non-trainable params: 0
__________________________________________________________________________________________________
plot_model(full_model2, show_shapes=True)
../_images/7f579aea7cf54f19d47f9fdf62278664210def44847756a1252b64b18c51dc99.png

Training#

# Run training
## Same training setup as Model 1: categorical cross-entropy over the
## one-hot targets, with a 20% validation split.
full_model2.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
history2 = full_model2.fit([encoder_input_onehot, decoder_input_onehot],
                           decoder_output_onehot,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=0.2)
Hide code cell output
Epoch 1/30
282/282 [==============================] - 15s 41ms/step - loss: 1.6741 - accuracy: 0.4298 - val_loss: 1.3807 - val_accuracy: 0.4797
Epoch 2/30
282/282 [==============================] - 10s 37ms/step - loss: 1.3570 - accuracy: 0.4876 - val_loss: 1.2359 - val_accuracy: 0.5358
Epoch 3/30
282/282 [==============================] - 10s 37ms/step - loss: 1.1740 - accuracy: 0.5549 - val_loss: 1.0431 - val_accuracy: 0.6041
Epoch 4/30
282/282 [==============================] - 11s 38ms/step - loss: 1.0069 - accuracy: 0.6182 - val_loss: 0.9273 - val_accuracy: 0.6492
Epoch 5/30
282/282 [==============================] - 11s 38ms/step - loss: 0.8949 - accuracy: 0.6601 - val_loss: 0.8019 - val_accuracy: 0.6991
Epoch 6/30
282/282 [==============================] - 11s 38ms/step - loss: 0.7825 - accuracy: 0.7055 - val_loss: 0.7418 - val_accuracy: 0.7175
Epoch 7/30
282/282 [==============================] - 11s 37ms/step - loss: 0.7193 - accuracy: 0.7296 - val_loss: 0.6848 - val_accuracy: 0.7386
Epoch 8/30
282/282 [==============================] - 11s 37ms/step - loss: 0.6616 - accuracy: 0.7529 - val_loss: 0.6420 - val_accuracy: 0.7577
Epoch 9/30
282/282 [==============================] - 11s 37ms/step - loss: 0.6194 - accuracy: 0.7652 - val_loss: 0.5948 - val_accuracy: 0.7697
Epoch 10/30
282/282 [==============================] - 11s 38ms/step - loss: 0.5769 - accuracy: 0.7806 - val_loss: 0.5939 - val_accuracy: 0.7626
Epoch 11/30
282/282 [==============================] - 11s 38ms/step - loss: 0.5427 - accuracy: 0.7925 - val_loss: 0.5238 - val_accuracy: 0.7942
Epoch 12/30
282/282 [==============================] - 11s 38ms/step - loss: 0.4977 - accuracy: 0.8093 - val_loss: 0.4988 - val_accuracy: 0.8008
Epoch 13/30
282/282 [==============================] - 11s 38ms/step - loss: 0.4652 - accuracy: 0.8203 - val_loss: 0.4432 - val_accuracy: 0.8276
Epoch 14/30
282/282 [==============================] - 11s 38ms/step - loss: 0.4127 - accuracy: 0.8397 - val_loss: 0.3641 - val_accuracy: 0.8566
Epoch 15/30
282/282 [==============================] - 11s 40ms/step - loss: 0.3344 - accuracy: 0.8739 - val_loss: 0.3037 - val_accuracy: 0.8810
Epoch 16/30
282/282 [==============================] - 12s 42ms/step - loss: 0.2560 - accuracy: 0.9070 - val_loss: 0.2081 - val_accuracy: 0.9239
Epoch 17/30
282/282 [==============================] - 11s 40ms/step - loss: 0.1846 - accuracy: 0.9365 - val_loss: 0.1693 - val_accuracy: 0.9386
Epoch 18/30
282/282 [==============================] - 11s 40ms/step - loss: 0.1471 - accuracy: 0.9495 - val_loss: 0.1273 - val_accuracy: 0.9556
Epoch 19/30
282/282 [==============================] - 11s 39ms/step - loss: 0.1071 - accuracy: 0.9669 - val_loss: 0.1022 - val_accuracy: 0.9657
Epoch 20/30
282/282 [==============================] - 11s 39ms/step - loss: 0.0847 - accuracy: 0.9752 - val_loss: 0.0810 - val_accuracy: 0.9739
Epoch 21/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0669 - accuracy: 0.9814 - val_loss: 0.0973 - val_accuracy: 0.9646
Epoch 22/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0819 - accuracy: 0.9727 - val_loss: 0.0619 - val_accuracy: 0.9805
Epoch 23/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0446 - accuracy: 0.9891 - val_loss: 0.0739 - val_accuracy: 0.9738
Epoch 24/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0472 - accuracy: 0.9863 - val_loss: 0.0718 - val_accuracy: 0.9744
Epoch 25/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0457 - accuracy: 0.9869 - val_loss: 0.0724 - val_accuracy: 0.9752
Epoch 26/30
282/282 [==============================] - 11s 39ms/step - loss: 0.0468 - accuracy: 0.9857 - val_loss: 0.0486 - val_accuracy: 0.9840
Epoch 27/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0255 - accuracy: 0.9940 - val_loss: 0.0339 - val_accuracy: 0.9888
Epoch 28/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0174 - accuracy: 0.9966 - val_loss: 0.0636 - val_accuracy: 0.9782
Epoch 29/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0625 - accuracy: 0.9786 - val_loss: 0.0401 - val_accuracy: 0.9864
Epoch 30/30
282/282 [==============================] - 11s 38ms/step - loss: 0.0248 - accuracy: 0.9932 - val_loss: 0.0343 - val_accuracy: 0.9883

Model 3 (Bidirectional)#

Define Model#

  • Important highlights:

    • In Model 3, we implement a bi-directional Encoder.

    • At each encoding step, there will be two hidden states (i.e., forward and backward passes)

# Define Model Inputs
encoder_inputs = Input(shape=(input_maxlen, input_vsize),
                       name='encoder_inputs')
decoder_inputs = Input(shape=(target_maxlen - 1, target_vsize),
                       name='decoder_inputs')

# Encoder GRU (bidirectional)
    ## Only the final forward and backward states are consumed below, so
    ## `return_state=True` alone suffices; the previous
    ## `return_sequences=True` computed all per-timestep outputs only to
    ## discard them. With `return_state=True`, the Bidirectional wrapper
    ## returns (output, forward_state, backward_state).
encoder_gru = Bidirectional(
    GRU(latent_dim,
        return_state=True,
        name='encoder_gru'))
_, encoder_state_fwd, encoder_state_bwd = encoder_gru(encoder_inputs)

# Combine forward and backward state (last h's) from encoder
encoder_state = Concatenate(axis=-1)([encoder_state_fwd, encoder_state_bwd])

# Decoder GRU
    # using `encoder_state` as initial state
    # latent_dim * 2 because the initial state concatenates the two last
    # states (forward + backward) of the bidirectional encoder
decoder_gru = GRU(latent_dim * 2,
                  return_sequences=True,
                  return_state=True,
                  name='decoder_gru')
decoder_out, _ = decoder_gru(decoder_inputs, initial_state=encoder_state)

# Dense softmax over the target vocabulary, applied at every decoding step.
dense = Dense(target_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_out)

# Full model
full_model3 = Model(inputs=[encoder_inputs, decoder_inputs],
                    outputs=decoder_pred)
full_model3.summary()
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
encoder_inputs (InputLayer)     [(None, 7, 12)]      0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 7, 512), (No 414720      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 5, 12)]      0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 512)          0           bidirectional[0][1]              
                                                                 bidirectional[0][2]              
__________________________________________________________________________________________________
decoder_gru (GRU)               [(None, 5, 512), (No 807936      decoder_inputs[0][0]             
                                                                 concatenate[0][0]                
__________________________________________________________________________________________________
time_distributed_layer (TimeDis (None, 5, 12)        6156        decoder_gru[0][0]                
==================================================================================================
Total params: 1,228,812
Trainable params: 1,228,812
Non-trainable params: 0
__________________________________________________________________________________________________
plot_model(full_model3, show_shapes=True)
../_images/78a14d31602929b0032e115d8ebb7b0525287389de4d900cf6f24dfb837ec94d.png

Training#

# Run training
## Same training setup as Models 1 and 2: categorical cross-entropy over
## the one-hot targets, with a 20% validation split.
full_model3.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
history3 = full_model3.fit([encoder_input_onehot, decoder_input_onehot],
                           decoder_output_onehot,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=0.2)
Hide code cell output
Epoch 1/30
282/282 [==============================] - 33s 103ms/step - loss: 1.6079 - accuracy: 0.4403 - val_loss: 1.3156 - val_accuracy: 0.5128
Epoch 2/30
282/282 [==============================] - 27s 95ms/step - loss: 1.1885 - accuracy: 0.5518 - val_loss: 0.9492 - val_accuracy: 0.6399
Epoch 3/30
282/282 [==============================] - 27s 97ms/step - loss: 0.8935 - accuracy: 0.6571 - val_loss: 0.7808 - val_accuracy: 0.6945
Epoch 4/30
282/282 [==============================] - 27s 97ms/step - loss: 0.7333 - accuracy: 0.7202 - val_loss: 0.6747 - val_accuracy: 0.7417
Epoch 5/30
282/282 [==============================] - 28s 99ms/step - loss: 0.6354 - accuracy: 0.7590 - val_loss: 0.5728 - val_accuracy: 0.7769
Epoch 6/30
282/282 [==============================] - 27s 97ms/step - loss: 0.5219 - accuracy: 0.8037 - val_loss: 0.4832 - val_accuracy: 0.8168
Epoch 7/30
282/282 [==============================] - 28s 99ms/step - loss: 0.4519 - accuracy: 0.8306 - val_loss: 0.4275 - val_accuracy: 0.8352
Epoch 8/30
282/282 [==============================] - 28s 99ms/step - loss: 0.3422 - accuracy: 0.8701 - val_loss: 0.1728 - val_accuracy: 0.9397
Epoch 9/30
282/282 [==============================] - 27s 97ms/step - loss: 0.1435 - accuracy: 0.9542 - val_loss: 0.1069 - val_accuracy: 0.9650
Epoch 10/30
282/282 [==============================] - 28s 99ms/step - loss: 0.0759 - accuracy: 0.9804 - val_loss: 0.1172 - val_accuracy: 0.9587
Epoch 11/30
282/282 [==============================] - 27s 97ms/step - loss: 0.0760 - accuracy: 0.9765 - val_loss: 0.0372 - val_accuracy: 0.9914
Epoch 12/30
282/282 [==============================] - 29s 104ms/step - loss: 0.0314 - accuracy: 0.9938 - val_loss: 0.2150 - val_accuracy: 0.9321
Epoch 13/30
282/282 [==============================] - 30s 105ms/step - loss: 0.0997 - accuracy: 0.9688 - val_loss: 0.0280 - val_accuracy: 0.9932
Epoch 14/30
282/282 [==============================] - 30s 107ms/step - loss: 0.0184 - accuracy: 0.9968 - val_loss: 0.0299 - val_accuracy: 0.9907
Epoch 15/30
282/282 [==============================] - 29s 103ms/step - loss: 0.0411 - accuracy: 0.9874 - val_loss: 0.0220 - val_accuracy: 0.9946
Epoch 16/30
282/282 [==============================] - 28s 101ms/step - loss: 0.0143 - accuracy: 0.9974 - val_loss: 0.0168 - val_accuracy: 0.9954
Epoch 17/30
282/282 [==============================] - 30s 106ms/step - loss: 0.0137 - accuracy: 0.9969 - val_loss: 0.1130 - val_accuracy: 0.9617
Epoch 18/30
282/282 [==============================] - 30s 106ms/step - loss: 0.0519 - accuracy: 0.9834 - val_loss: 0.0137 - val_accuracy: 0.9964
Epoch 19/30
282/282 [==============================] - 30s 108ms/step - loss: 0.0073 - accuracy: 0.9990 - val_loss: 0.0128 - val_accuracy: 0.9964
Epoch 20/30
282/282 [==============================] - 30s 107ms/step - loss: 0.0317 - accuracy: 0.9897 - val_loss: 0.0216 - val_accuracy: 0.9934
Epoch 21/30
282/282 [==============================] - 30s 108ms/step - loss: 0.0147 - accuracy: 0.9964 - val_loss: 0.0442 - val_accuracy: 0.9854
Epoch 22/30
282/282 [==============================] - 30s 106ms/step - loss: 0.0246 - accuracy: 0.9921 - val_loss: 0.0129 - val_accuracy: 0.9960
Epoch 23/30
282/282 [==============================] - 30s 108ms/step - loss: 0.0072 - accuracy: 0.9986 - val_loss: 0.0363 - val_accuracy: 0.9874
Epoch 24/30
282/282 [==============================] - 30s 105ms/step - loss: 0.0499 - accuracy: 0.9830 - val_loss: 0.0080 - val_accuracy: 0.9979
Epoch 25/30
282/282 [==============================] - 30s 106ms/step - loss: 0.0036 - accuracy: 0.9997 - val_loss: 0.0084 - val_accuracy: 0.9976
Epoch 26/30
282/282 [==============================] - 30s 106ms/step - loss: 0.0091 - accuracy: 0.9976 - val_loss: 0.0495 - val_accuracy: 0.9825
Epoch 27/30
282/282 [==============================] - 29s 104ms/step - loss: 0.0271 - accuracy: 0.9911 - val_loss: 0.0102 - val_accuracy: 0.9972
Epoch 28/30
282/282 [==============================] - 31s 111ms/step - loss: 0.0088 - accuracy: 0.9977 - val_loss: 0.0421 - val_accuracy: 0.9854
Epoch 29/30
282/282 [==============================] - 30s 107ms/step - loss: 0.0404 - accuracy: 0.9876 - val_loss: 0.0074 - val_accuracy: 0.9980
Epoch 30/30
282/282 [==============================] - 30s 107ms/step - loss: 0.0058 - accuracy: 0.9986 - val_loss: 0.0093 - val_accuracy: 0.9970

Model 4 (Peeky Decoder)#

Define Model#

  • Important highlights:

    • In the previous models, Decoder only utilizes Encoder’s last hidden state for the decoding of the first output. As for the subsequent decoding time steps, Decoder does not have any information from Encoder.

    • In Model 4, we implement a peeky Decoder. This strategy allows Decoder to access the information (last hidden state) of the Encoder in every decoding time step.

# Define Model Inputs
# Encoder sees the full one-hot input sequence; Decoder sees the target
# sequence shifted by one position (hence target_maxlen - 1).
encoder_inputs = Input(shape=(input_maxlen, input_vsize),
                       name='encoder_inputs')
decoder_inputs = Input(shape=(target_maxlen - 1, target_vsize),
                       name='decoder_inputs')

# Encoder GRU
# The Bidirectional wrapper returns (sequence output, forward last h,
# backward last h); only the two final states are used here.
encoder_gru = Bidirectional(
    GRU(latent_dim,
        return_sequences=True,
        return_state=True,
        name='encoder_gru'))
_, encoder_state_fwd, encoder_state_bwd = encoder_gru(encoder_inputs)

# Combine forward and backward state (last h's) from encoder
# -> shape (batch, 2 * latent_dim)
encoder_state = Concatenate(axis=-1)([encoder_state_fwd, encoder_state_bwd])


# Repeat the last-hidden-state of Encoder once per decoding time step
# so that every step can "peek" at the encoder summary.
encoder_state_repeated = RepeatVector(target_maxlen - 1)(encoder_state)

## Concatenate every decoder input with the encoder_state
## (this is what makes the decoder "peeky")
decoder_inputs_peeky = Concatenate(axis=2)(
    [decoder_inputs, encoder_state_repeated])

# Decoder GRU
# Width is latent_dim * 2 so its hidden state matches the concatenated
# bidirectional encoder state passed as initial_state below.
decoder_gru = GRU(latent_dim * 2,
                  return_sequences=True,
                  return_state=True,
                  name='decoder_gru')
decoder_out, _ = decoder_gru(decoder_inputs_peeky, initial_state=encoder_state)

# Dense layer: per-time-step softmax over the target vocabulary
dense = Dense(target_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_out)

# Full model
full_model4 = Model(inputs=[encoder_inputs, decoder_inputs],
                    outputs=decoder_pred)

full_model4.summary()
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
encoder_inputs (InputLayer)     [(None, 7, 12)]      0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 7, 512), (No 414720      encoder_inputs[0][0]             
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 512)          0           bidirectional_1[0][1]            
                                                                 bidirectional_1[0][2]            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 5, 12)]      0                                            
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 5, 512)       0           concatenate_1[0][0]              
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 5, 524)       0           decoder_inputs[0][0]             
                                                                 repeat_vector[0][0]              
__________________________________________________________________________________________________
decoder_gru (GRU)               [(None, 5, 512), (No 1594368     concatenate_2[0][0]              
                                                                 concatenate_1[0][0]              
__________________________________________________________________________________________________
time_distributed_layer (TimeDis (None, 5, 12)        6156        decoder_gru[0][0]                
==================================================================================================
Total params: 2,015,244
Trainable params: 2,015,244
Non-trainable params: 0
__________________________________________________________________________________________________
plot_model(full_model4, show_shapes=True)
../_images/762645d79cc242050d7c8356d4f87bd9e29a8802a5d164553bd9824841e28857.png

Training#

# Compile and train Model 4 (peeky decoder)
full_model4.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
history4 = full_model4.fit(
    x=[encoder_input_onehot, decoder_input_onehot],  # teacher forcing inputs
    y=decoder_output_onehot,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2)
Hide code cell output
Epoch 1/30
282/282 [==============================] - 50s 162ms/step - loss: 1.5986 - accuracy: 0.4271 - val_loss: 1.2786 - val_accuracy: 0.5222
Epoch 2/30
282/282 [==============================] - 42s 150ms/step - loss: 1.1654 - accuracy: 0.5591 - val_loss: 0.9077 - val_accuracy: 0.6633
Epoch 3/30
282/282 [==============================] - 43s 151ms/step - loss: 0.8362 - accuracy: 0.6821 - val_loss: 0.6661 - val_accuracy: 0.7411
Epoch 4/30
282/282 [==============================] - 41s 146ms/step - loss: 0.6143 - accuracy: 0.7663 - val_loss: 0.4927 - val_accuracy: 0.8118
Epoch 5/30
282/282 [==============================] - 41s 147ms/step - loss: 0.4099 - accuracy: 0.8465 - val_loss: 0.2593 - val_accuracy: 0.9028
Epoch 6/30
282/282 [==============================] - 42s 147ms/step - loss: 0.1960 - accuracy: 0.9337 - val_loss: 0.1425 - val_accuracy: 0.9537
Epoch 7/30
282/282 [==============================] - 41s 146ms/step - loss: 0.1075 - accuracy: 0.9707 - val_loss: 0.1618 - val_accuracy: 0.9408
Epoch 8/30
282/282 [==============================] - 42s 147ms/step - loss: 0.0851 - accuracy: 0.9751 - val_loss: 0.0522 - val_accuracy: 0.9863
Epoch 9/30
282/282 [==============================] - 42s 149ms/step - loss: 0.0640 - accuracy: 0.9817 - val_loss: 0.0374 - val_accuracy: 0.9918
Epoch 10/30
282/282 [==============================] - 41s 145ms/step - loss: 0.0339 - accuracy: 0.9923 - val_loss: 0.0791 - val_accuracy: 0.9717
Epoch 11/30
282/282 [==============================] - 40s 141ms/step - loss: 0.0620 - accuracy: 0.9803 - val_loss: 0.0206 - val_accuracy: 0.9959
Epoch 12/30
282/282 [==============================] - 40s 141ms/step - loss: 0.0148 - accuracy: 0.9977 - val_loss: 0.0166 - val_accuracy: 0.9960
Epoch 13/30
282/282 [==============================] - 41s 145ms/step - loss: 0.0557 - accuracy: 0.9829 - val_loss: 0.0198 - val_accuracy: 0.9957
Epoch 14/30
282/282 [==============================] - 43s 151ms/step - loss: 0.0119 - accuracy: 0.9984 - val_loss: 0.0168 - val_accuracy: 0.9952
Epoch 15/30
282/282 [==============================] - 41s 144ms/step - loss: 0.0140 - accuracy: 0.9966 - val_loss: 0.1039 - val_accuracy: 0.9615
Epoch 16/30
282/282 [==============================] - 40s 140ms/step - loss: 0.0588 - accuracy: 0.9795 - val_loss: 0.0114 - val_accuracy: 0.9972
Epoch 17/30
282/282 [==============================] - 40s 143ms/step - loss: 0.0085 - accuracy: 0.9984 - val_loss: 0.0370 - val_accuracy: 0.9867
Epoch 18/30
282/282 [==============================] - 40s 144ms/step - loss: 0.0661 - accuracy: 0.9778 - val_loss: 0.0124 - val_accuracy: 0.9970
Epoch 19/30
282/282 [==============================] - 40s 141ms/step - loss: 0.0062 - accuracy: 0.9991 - val_loss: 0.0195 - val_accuracy: 0.9936
Epoch 20/30
282/282 [==============================] - 41s 144ms/step - loss: 0.0441 - accuracy: 0.9851 - val_loss: 0.0096 - val_accuracy: 0.9979
Epoch 21/30
282/282 [==============================] - 40s 143ms/step - loss: 0.0280 - accuracy: 0.9909 - val_loss: 0.0191 - val_accuracy: 0.9945
Epoch 22/30
282/282 [==============================] - 41s 145ms/step - loss: 0.0089 - accuracy: 0.9981 - val_loss: 0.0064 - val_accuracy: 0.9984
Epoch 23/30
282/282 [==============================] - 39s 139ms/step - loss: 0.0055 - accuracy: 0.9987 - val_loss: 0.0730 - val_accuracy: 0.9732
Epoch 24/30
282/282 [==============================] - 40s 142ms/step - loss: 0.0567 - accuracy: 0.9802 - val_loss: 0.0076 - val_accuracy: 0.9982
Epoch 25/30
282/282 [==============================] - 39s 139ms/step - loss: 0.0030 - accuracy: 0.9997 - val_loss: 0.0048 - val_accuracy: 0.9986
Epoch 26/30
282/282 [==============================] - 39s 139ms/step - loss: 0.0022 - accuracy: 0.9997 - val_loss: 0.0069 - val_accuracy: 0.9979
Epoch 27/30
282/282 [==============================] - 40s 140ms/step - loss: 0.0652 - accuracy: 0.9788 - val_loss: 0.0092 - val_accuracy: 0.9978
Epoch 28/30
282/282 [==============================] - 39s 138ms/step - loss: 0.0041 - accuracy: 0.9994 - val_loss: 0.0051 - val_accuracy: 0.9989
Epoch 29/30
282/282 [==============================] - 39s 139ms/step - loss: 0.0044 - accuracy: 0.9991 - val_loss: 0.0062 - val_accuracy: 0.9982
Epoch 30/30
282/282 [==============================] - 40s 142ms/step - loss: 0.0079 - accuracy: 0.9976 - val_loss: 0.0614 - val_accuracy: 0.9776

Model 5 (Attention)#

Define Model#

  • Important highlights:

    • In Model 5, we implement an Attention-based Decoder.

    • This Attention mechanism allows the Decoder to make use of all of the Encoder’s hidden states, not just the last one.

# Define an input sequence and process it.
# Encoder input: one-hot (input_maxlen, input_vsize);
# Decoder input: one-hot shifted target (target_maxlen - 1, target_vsize).
encoder_inputs = Input(shape=(input_maxlen, input_vsize),
                       name='encoder_inputs')
decoder_inputs = Input(shape=(target_maxlen - 1, target_vsize),
                       name='decoder_inputs')
# Encoder GRU
# return_sequences=True keeps ALL hidden states (needed by Attention);
# return_state=True additionally yields the last hidden state.
encoder_gru = GRU(latent_dim,
                  return_sequences=True,
                  return_state=True,
                  name='encoder_gru')
encoder_out, encoder_state = encoder_gru(encoder_inputs)

# Decoder GRU, initialized with the encoder's last hidden state
decoder_gru = GRU(latent_dim,
                  return_sequences=True,
                  return_state=True,
                  name='decoder_gru')
decoder_out, decoder_state = decoder_gru(decoder_inputs,
                                         initial_state=encoder_state)

# Attention layer (keras.layers.Attention is Luong-style dot-product attention)
attn_layer = Attention(name="attention_layer")

## The inputs for Attention:
##  `query`: the `decoder_out` = decoder's hidden state at the decoding step
##  `value` & `key`: the `encoder_out` = encoder's all hidden states
## It returns a tensor of shape as `query`, i.e., context tensor

attn_out, attn_weights = attn_layer([decoder_out, encoder_out],
                                    return_attention_scores=True)

# Concat context tensor + decoder_out along the feature axis
decoder_concat_input = Concatenate(
    axis=-1, name='concat_layer')([decoder_out, attn_out])

# Dense layer: per-time-step softmax over the target vocabulary
dense = Dense(target_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_concat_input)

# Full model
full_model5 = Model(inputs=[encoder_inputs, decoder_inputs],
                    outputs=decoder_pred)
full_model5.summary()
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
encoder_inputs (InputLayer)     [(None, 7, 12)]      0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 5, 12)]      0                                            
__________________________________________________________________________________________________
encoder_gru (GRU)               [(None, 7, 256), (No 207360      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_gru (GRU)               [(None, 5, 256), (No 207360      decoder_inputs[0][0]             
                                                                 encoder_gru[0][1]                
__________________________________________________________________________________________________
attention_layer (Attention)     ((None, 5, 256), (No 0           decoder_gru[0][0]                
                                                                 encoder_gru[0][0]                
__________________________________________________________________________________________________
concat_layer (Concatenate)      (None, 5, 512)       0           decoder_gru[0][0]                
                                                                 attention_layer[0][0]            
__________________________________________________________________________________________________
time_distributed_layer (TimeDis (None, 5, 12)        6156        concat_layer[0][0]               
==================================================================================================
Total params: 420,876
Trainable params: 420,876
Non-trainable params: 0
__________________________________________________________________________________________________
plot_model(full_model5, show_shapes=True)
../_images/1380877a6a8bf55eeb114fe372b41dd01257b5efa8a03908a29d1c9a6e3b4152.png

Training#

# Compile and train Model 5 (attention-based decoder)
full_model5.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
history5 = full_model5.fit(
    x=[encoder_input_onehot, decoder_input_onehot],  # teacher forcing inputs
    y=decoder_output_onehot,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2)
Hide code cell output
Epoch 1/30
282/282 [==============================] - 15s 42ms/step - loss: 1.6877 - accuracy: 0.4021 - val_loss: 1.3822 - val_accuracy: 0.4808
Epoch 2/30
282/282 [==============================] - 11s 39ms/step - loss: 1.3569 - accuracy: 0.4933 - val_loss: 1.2155 - val_accuracy: 0.5394
Epoch 3/30
282/282 [==============================] - 11s 39ms/step - loss: 1.1449 - accuracy: 0.5621 - val_loss: 1.0299 - val_accuracy: 0.6038
Epoch 4/30
282/282 [==============================] - 11s 39ms/step - loss: 1.0067 - accuracy: 0.6153 - val_loss: 0.9458 - val_accuracy: 0.6377
Epoch 5/30
282/282 [==============================] - 11s 39ms/step - loss: 0.9173 - accuracy: 0.6472 - val_loss: 0.8299 - val_accuracy: 0.6754
Epoch 6/30
282/282 [==============================] - 11s 39ms/step - loss: 0.8073 - accuracy: 0.6889 - val_loss: 0.7581 - val_accuracy: 0.7086
Epoch 7/30
282/282 [==============================] - 11s 39ms/step - loss: 0.7421 - accuracy: 0.7178 - val_loss: 0.7211 - val_accuracy: 0.7213
Epoch 8/30
282/282 [==============================] - 11s 40ms/step - loss: 0.7009 - accuracy: 0.7345 - val_loss: 0.6750 - val_accuracy: 0.7449
Epoch 9/30
282/282 [==============================] - 11s 39ms/step - loss: 0.6572 - accuracy: 0.7516 - val_loss: 0.6387 - val_accuracy: 0.7535
Epoch 10/30
282/282 [==============================] - 11s 39ms/step - loss: 0.6150 - accuracy: 0.7649 - val_loss: 0.6060 - val_accuracy: 0.7602
Epoch 11/30
282/282 [==============================] - 11s 39ms/step - loss: 0.5754 - accuracy: 0.7778 - val_loss: 0.5544 - val_accuracy: 0.7815
Epoch 12/30
282/282 [==============================] - 11s 39ms/step - loss: 0.5373 - accuracy: 0.7911 - val_loss: 0.5174 - val_accuracy: 0.7938
Epoch 13/30
282/282 [==============================] - 11s 39ms/step - loss: 0.4893 - accuracy: 0.8087 - val_loss: 0.4801 - val_accuracy: 0.8064
Epoch 14/30
282/282 [==============================] - 11s 39ms/step - loss: 0.4277 - accuracy: 0.8358 - val_loss: 0.3971 - val_accuracy: 0.8439
Epoch 15/30
282/282 [==============================] - 11s 38ms/step - loss: 0.3748 - accuracy: 0.8565 - val_loss: 0.3449 - val_accuracy: 0.8666
Epoch 16/30
282/282 [==============================] - 11s 39ms/step - loss: 0.3049 - accuracy: 0.8856 - val_loss: 0.2822 - val_accuracy: 0.8943
Epoch 17/30
282/282 [==============================] - 11s 39ms/step - loss: 0.2494 - accuracy: 0.9105 - val_loss: 0.2310 - val_accuracy: 0.9134
Epoch 18/30
282/282 [==============================] - 11s 39ms/step - loss: 0.1963 - accuracy: 0.9309 - val_loss: 0.1759 - val_accuracy: 0.9363
Epoch 19/30
282/282 [==============================] - 11s 40ms/step - loss: 0.1470 - accuracy: 0.9511 - val_loss: 0.1480 - val_accuracy: 0.9485
Epoch 20/30
282/282 [==============================] - 11s 40ms/step - loss: 0.1082 - accuracy: 0.9688 - val_loss: 0.1172 - val_accuracy: 0.9612
Epoch 21/30
282/282 [==============================] - 12s 44ms/step - loss: 0.0887 - accuracy: 0.9750 - val_loss: 0.1146 - val_accuracy: 0.9598
Epoch 22/30
282/282 [==============================] - 13s 46ms/step - loss: 0.0738 - accuracy: 0.9805 - val_loss: 0.0870 - val_accuracy: 0.9717
Epoch 23/30
282/282 [==============================] - 13s 45ms/step - loss: 0.0655 - accuracy: 0.9821 - val_loss: 0.0683 - val_accuracy: 0.9788
Epoch 24/30
282/282 [==============================] - 13s 46ms/step - loss: 0.0520 - accuracy: 0.9871 - val_loss: 0.0820 - val_accuracy: 0.9724
Epoch 25/30
282/282 [==============================] - 13s 45ms/step - loss: 0.0640 - accuracy: 0.9802 - val_loss: 0.0594 - val_accuracy: 0.9820
Epoch 26/30
282/282 [==============================] - 12s 43ms/step - loss: 0.0428 - accuracy: 0.9892 - val_loss: 0.0678 - val_accuracy: 0.9772
Epoch 27/30
282/282 [==============================] - 11s 39ms/step - loss: 0.0400 - accuracy: 0.9892 - val_loss: 0.0724 - val_accuracy: 0.9751
Epoch 28/30
282/282 [==============================] - 11s 40ms/step - loss: 0.0531 - accuracy: 0.9843 - val_loss: 0.0412 - val_accuracy: 0.9878
Epoch 29/30
282/282 [==============================] - 12s 42ms/step - loss: 0.0284 - accuracy: 0.9936 - val_loss: 0.0672 - val_accuracy: 0.9770
Epoch 30/30
282/282 [==============================] - 11s 39ms/step - loss: 0.0516 - accuracy: 0.9840 - val_loss: 0.0346 - val_accuracy: 0.9903
# Visualize the attention model's architecture and its training curves.
plot_model(full_model5, show_shapes=True)
../_images/1380877a6a8bf55eeb114fe372b41dd01257b5efa8a03908a29d1c9a6e3b4152.png
plot1(history5)  # NOTE(review): plot1() is presumably a helper defined earlier in the notebook — not shown here
../_images/c3fe4a5c07ae3d3d1a63dd9cb841f4351a95e2c7c4a9c78fa851369b4422a372.png ../_images/57fe22941736e95cf81e6817fe1a633968551279f3567877b4e30ebcb5e38402.png

Save Models#

# # Save model
# Persist the trained attention model (architecture + weights) in HDF5 format.
full_model5.save('keras_models/s2s-addition-attention.h5')

Interim Comparison#

# Collect the training histories of all five models for comparison.
history = [history1, history2, history3, history4, history5]
history = [h.history for h in history]  # keep only the metrics dicts
# Legend labels, in the same order as `history`
# (fixed typos: 'VanilaRNN' -> 'Vanilla RNN', 'Birdirectional' -> 'Bidirectional')
model_names = ['Vanilla RNN', 'GRU', 'Bidirectional', 'Peeky', 'Attention']
# ## Saving all training histories (optional persistence)
# import pickle
# with open('keras_models/s2s-attention-addition-history', 'wb') as f:
#     pickle.dump(history, f)
# with open('keras_models/s2s-attention-addition-history', 'rb') as f:
#     history = pickle.load(f)
acc = [h['accuracy'] for h in history]
val_acc = [h['val_accuracy'] for h in history]

# Plot per-epoch training accuracy for every model on one figure.
plt.figure(figsize=(10, 8))
plt.style.use('fivethirtyeight')
for idx, curve in enumerate(acc):
    # Prepend 0 so every curve starts from the origin at epoch 0.
    plt.plot(range(len(curve) + 1), [0] + curve,
             marker='o',
             linestyle='--',
             linewidth=1,
             label=model_names[idx])
plt.legend()
plt.title('Comparing Different Sequence Models')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.tight_layout()
plt.show()
../_images/68f9c37e51803dc5dd9fa4040ea1420b4f0911336935183d0b11bce4d0c82650.png
# Plot per-epoch training loss for every model on one figure.
loss = [h['loss'] for h in history]

plt.figure(figsize=(10, 8))
plt.style.use('fivethirtyeight')
for idx, curve in enumerate(loss):
    plt.plot(range(len(curve)),
             curve,
             marker='o',
             linestyle='--',
             linewidth=1,
             label=model_names[idx])
plt.legend()
plt.title('Comparing Different Sequence Models')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.tight_layout()
plt.show()
../_images/8ea0b9998141f0eca677de08efcf856c942f6e43badb5fa883f463f2c76b1f84.png

Attention Model Analysis#

# ## If the model is loaded via external files
# ## Load the encoder_model, decoder_model this way
# from keras.models import load_model
# full_model5.load_weights('keras_models/s2s-addition-attention.h5')
# full_model5.compile(optimizer='adam',
#                     loss='categorical_crossentropy',
#                     metrics=['accuracy'])
## Let's look at the attention-based model:
## alias Model 5 for the inference code below.
full_model = full_model5

Inference#

  • At the inference stage, we use the trained model to decode input sequences.

  • In decoding, it should be noted that Decoder would decode one word at a time.

  • We set up Inference-Encoder and Inference-Decoder based on the trained model. We need to identify the right layer from the trained model for the use in inference.

full_model.summary()
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
encoder_inputs (InputLayer)     [(None, 7, 12)]      0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, 5, 12)]      0                                            
__________________________________________________________________________________________________
encoder_gru (GRU)               [(None, 7, 256), (No 207360      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_gru (GRU)               [(None, 5, 256), (No 207360      decoder_inputs[0][0]             
                                                                 encoder_gru[0][1]                
__________________________________________________________________________________________________
attention_layer (Attention)     ((None, 5, 256), (No 0           decoder_gru[0][0]                
                                                                 encoder_gru[0][0]                
__________________________________________________________________________________________________
concat_layer (Concatenate)      (None, 5, 512)       0           decoder_gru[0][0]                
                                                                 attention_layer[0][0]            
__________________________________________________________________________________________________
time_distributed_layer (TimeDis (None, 5, 12)        6156        concat_layer[0][0]               
==================================================================================================
Total params: 420,876
Trainable params: 420,876
Non-trainable params: 0
__________________________________________________________________________________________________

Inference Encoder#

## Inference-Encoder
## Rebuild an encoder-only model from the trained layers:
## input sequence -> (all hidden states, last hidden state).
encoder_inf_inputs = full_model.input[0]  # the 'encoder_inputs' Input layer
## layers[2] is 'encoder_gru' (see the summary printed above)
encoder_inf_out, encoder_inf_state = full_model.layers[2].output
encoder_inf_model = Model(inputs=encoder_inf_inputs,
                          outputs=[encoder_inf_out, encoder_inf_state])
plot_model(encoder_inf_model)
../_images/7747642379b4875d8b5cf75acf42040141d91984d733e5a904cdd584e5aa360f.png

Inference Decoder#

  • Inference-Decoder requires two inputs:

    • Encoder’s last hidden state as its initial hidden state

    • The input token of the target sequence (default start token: _)

  • Inference-Attention requires two inputs:

    • All of the Encoder’s hidden states as the values and keys

    • Inference-Decoder’s hidden state as the query

## Inference-Decoder Input (1): the current target token,
    ## fed one token at a time;
    ## decoding is seeded with the start token '_'
decoder_inf_inputs = Input(
    shape=(1, target_vsize),
    name='decoder_inf_inputs')  ## Initial Decoder's Output Token '_'

## Inference-Decoder Input (2): All hidden states from Inference-Encoder
## (used as the attention keys/values)
encoder_inf_states = Input(
    shape=(input_maxlen, latent_dim),
    name='encoder_inf_states')

## Inference-Decoder Initial Hidden State = Inference-Encoder's last h
## (a GRU has a single hidden state h; there is no cell state c)
## NOTE(review): shape=(latent_dim) is an int, not a 1-tuple; it worked
## here, but the conventional spelling is shape=(latent_dim,)
decoder_init_state = Input(shape=(latent_dim),
                           name='decoder_init')  ## initial h from encoder

## Inference-Decoder: reuse the trained decoder GRU (layers[3])
decoder_inf_gru = full_model.layers[3]
decoder_inf_out, decoder_inf_state = decoder_inf_gru(
    decoder_inf_inputs, initial_state=decoder_init_state)


## Inference-Attention: reuse the trained attention layer (layers[4])
decoder_inf_attention = full_model.layers[4]
attn_inf_out, attn_inf_weights = decoder_inf_attention(
    [decoder_inf_out, encoder_inf_states], return_attention_scores=True)

## Inference-Concatenate: context vector + decoder hidden state
decoder_inf_concat = Concatenate(
    axis=-1, name='concat')([decoder_inf_out, attn_inf_out])

## Inference-Dense: reuse the trained softmax projection (layers[6])
decoder_inf_timedense = full_model.layers[6]
decoder_inf_pred = decoder_inf_timedense(decoder_inf_concat)

## Inference-Model:
## (encoder states, previous hidden state, previous token)
##   -> (next-token distribution, attention weights, new hidden state)
decoder_inf_model = Model(
    inputs=[encoder_inf_states, decoder_init_state, decoder_inf_inputs],
    outputs=[decoder_inf_pred, attn_inf_weights, decoder_inf_state])
plot_model(decoder_inf_model, show_shapes=True)
../_images/d0d3d34e8ccd067670bb299fb9aaa4db999114250656d67844a1d05680fd4e86.png

Decoding Input Sequences#

  • The Inference-Encoder processes the tokens of input sequences to get (a) all hidden states, and (b) the last hidden state.

  • The Inference-Decoder uses the Inference-Encoder’s last hidden state as the initial hidden state.

  • The Inference-Decoder uses the token _ as the initial token of the target sequence for decoding.

  • At the subsequent decoding steps, Inference-Decoder updates its hidden state and the decoded token as inputs for the next-round decoding.

  • Inference-Decoder is different from the training Decoder in that:

    • The latter takes in all the correct target sequences (i.e., decoder_input_onehot) for teacher forcing.

    • The former takes in one target token, which is predicted by the Inference-Decoder at the previous decoding step.

def decode_sequence(input_seq):
    """Greedily decode one one-hot-encoded input sequence.

    Parameters
    ----------
    input_seq : np.ndarray
        One-hot encoded source sequence, shaped for a single sample
        (1, input_maxlen, input_vsize) — assumed; confirm against caller.

    Returns
    -------
    tuple
        (decoded_text, attention_weights) where decoded_text is the
        predicted target string and attention_weights is a list of
        (token_index, attention_matrix) pairs, one per decoding step.
    """
    ## Seed the decoder with the start-of-sequence token "_".
    ## texts_to_sequences returns a LIST of sequences; take the first
    ## (and only) one so the seed one-hot batch is (1, 1, target_vsize),
    ## consistent with the per-step re-encoding below.  (The original
    ## passed the bare string, yielding a nested [[idx]] and an extra
    ## axis on the first step only.)
    start_token = '_'
    initial_seq = target_tokenizer.texts_to_sequences([start_token])[0]
    test_dec_onehot_seq = np.expand_dims(
        to_categorical(initial_seq, num_classes=target_vsize), 1)

    ## Inference-Encoder: all hidden states + last hidden state
    enc_outs, enc_last_state = encoder_inf_model.predict(input_seq)

    ## Decoder starts from the encoder's last hidden state
    dec_state = enc_last_state

    ## Holders for attention weights and the decoded text
    attention_weights = []
    dec_text = ''

    ## Step-by-step greedy decoding, at most target_maxlen steps
    for _ in range(target_maxlen):
        dec_out, attention, dec_state = decoder_inf_model.predict(
            [enc_outs, dec_state, test_dec_onehot_seq])

        ## Greedy choice: most probable token index at this step
        dec_ind = np.argmax(dec_out, axis=-1)[0, 0]

        ## Index 0 is the padding token -> stop decoding
        if dec_ind == 0:
            break

        ## Feed the predicted token back in as the next-step input
        initial_seq = [dec_ind]
        test_dec_onehot_seq = np.expand_dims(
            to_categorical(initial_seq, num_classes=target_vsize), 1)

        ## Keep the attention weights for this decoded token (for plotting)
        attention_weights.append((dec_ind, attention))

        ## Append the decoded character
        dec_text += dec_index2word[dec_ind]

    return dec_text, attention_weights
## Sanity check: run the sequence-decoding function over the first
## 20 training samples and print each prediction next to its input.
for idx in range(20):
    predicted, _ = decode_sequence(
        encoder_input_onehot[idx:idx + 1, :, :])
    print('-')
    print('Input sentence:', tr_input_texts[idx])
    print('Decoded sentence:', predicted)
-
Input sentence: 27+673
Decoded sentence: 700_
-
Input sentence: 153+27
Decoded sentence: 180_
-
Input sentence: 93+901
Decoded sentence: 994_
-
Input sentence: 243+678
Decoded sentence: 921_
-
Input sentence: 269+46
Decoded sentence: 315_
-
Input sentence: 235+891
Decoded sentence: 1126_
-
Input sentence: 46+290
Decoded sentence: 336_
-
Input sentence: 324+947
Decoded sentence: 1271_
-
Input sentence: 721+49
Decoded sentence: 770_
-
Input sentence: 535+7
Decoded sentence: 542_
-
Input sentence: 45+117
Decoded sentence: 162_
-
Input sentence: 669+174
Decoded sentence: 843_
-
Input sentence: 904+7
Decoded sentence: 911_
-
Input sentence: 22+731
Decoded sentence: 753_
-
Input sentence: 83+742
Decoded sentence: 825_
-
Input sentence: 678+983
Decoded sentence: 1661_
-
Input sentence: 240+42
Decoded sentence: 282_
-
Input sentence: 18+44
Decoded sentence: 62_
-
Input sentence: 4+166
Decoded sentence: 170_
-
Input sentence: 731+13
Decoded sentence: 744_

Plotting Attention#

## Visualize the attention weights for a single sample as a heatmap:
## rows = source (input) positions, columns = decoded (output) steps.
ind = 1

doc_inputs, attention_weights = decode_sequence(
    encoder_input_onehot[ind:ind + 1, :, :])

## One flattened attention vector per decoded token.
mats = [attn.reshape(-1) for _, attn in attention_weights]
dec_inputs = [dec_ind for dec_ind, _ in attention_weights]

attention_mat = np.array(mats).T
fig, ax = plt.subplots(figsize=(5, 7))
ax.imshow(attention_mat)

ax.set_xticks(np.arange(attention_mat.shape[1]))
ax.set_yticks(np.arange(attention_mat.shape[0]))

## x labels: decoded target tokens; y labels: source input tokens.
## Index 0 is the padding token in both vocabularies.
ax.set_xticklabels(
    [dec_index2word[tok] if tok != 0 else "<PAD>" for tok in dec_inputs])
ax.set_yticklabels([
    enc_index2word[tok] if tok != 0 else "<PAD>"
    for tok in encoder_input_sequences[ind]
])

ax.tick_params(labelsize=12)
ax.tick_params(axis='x', labelrotation=90)
../_images/3da6f8ec8d6546d9e511fc24d1e31c80f056cb289d9a9949b91a9fd278383598.png

Evaluation on Testing Data#

  • Wrap text vectorization in a function.

  • Vectorize the testing data in the same way as the training data

    • Texts to sequences

    • Pad sequences

    • One-hot encode sequences

def preprocess_data(enc_tokenizer, dec_tokenizer, enc_text, dec_text,
                    enc_maxlen, dec_maxlen, enc_vsize, dec_vsize):
    """Vectorize raw text pairs exactly as the training data was.

    Each side goes texts -> integer sequences -> post-padded
    sequences -> one-hot tensors.

    Returns
    -------
    tuple
        (encoder_onehot, decoder_onehot) numpy arrays.
    """
    def _vectorize(tokenizer, texts, maxlen, vsize):
        ## texts -> padded integer sequences -> one-hot encoding
        seqs = tokenizer.texts_to_sequences(texts)
        seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)
        return to_categorical(seqs, num_classes=vsize)

    enc_onehot = _vectorize(enc_tokenizer, enc_text, enc_maxlen, enc_vsize)
    dec_onehot = _vectorize(dec_tokenizer, dec_text, dec_maxlen, dec_vsize)
    return enc_onehot, dec_onehot
## Vectorize the held-out testing data with the SAME tokenizers,
## max lengths, and vocabulary sizes that were fit on the training data.
ts_encoder_input_onehot, ts_decoder_target_onehot = preprocess_data(
    input_tokenizer, target_tokenizer, ts_input_texts, ts_target_texts,
    input_maxlen, target_maxlen, input_vsize, target_vsize)
print(ts_encoder_input_onehot.shape)
print(ts_decoder_target_onehot.shape)
(5000, 7, 12)
(5000, 6, 12)
## Evaluate the trained attention model on the test set.
## The slicing mirrors teacher forcing: the decoder INPUT drops the last
## timestep, and the decoder TARGET drops the first timestep, so the model
## predicts token t+1 from tokens up to t.
full_model5.evaluate(
    [ts_encoder_input_onehot, ts_decoder_target_onehot[:, :-1, :]],
    ts_decoder_target_onehot[:, 1:, :],
    batch_size=batch_size,
    verbose=2)
40/40 - 1s - loss: 0.0353 - accuracy: 0.9894
[0.035290032625198364, 0.9894400238990784]

References#