Neural Language Model of Chinese#
How to develop a neural language model in Keras for Chinese texts
A neural language model can be built on:
character sequences
word sequences
This notebook builds a character-based model (see the quick illustration below); the corpus consists of two novels by Jin Yong (金庸)
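A quick illustration of the two granularities (a sketch only, not part of the pipeline below; jieba is an assumed third-party segmenter, installable with pip install jieba):
import jieba  # third-party Chinese word segmenter, used only for this illustration

sent = "青光閃動"
print(list(sent))        # character-level tokens: ['青', '光', '閃', '動']
print(jieba.lcut(sent))  # word-level tokens produced by jieba's segmentation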
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
#import sys
#sys.path.insert(1, '/content/drive/My Drive/_MySyncDrive/Repository/python-notes/nlp')
import os
os.chdir('/content/drive/My Drive/_MySyncDrive/Repository/python-notes/nlp')
import string
import text_normalizer_zh as tn
import re
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
# turn a doc into clean content paragraphs
def clean_doc_paras(doc):
    # keep content paragraphs only
    paras = [p for p in doc.split(sep="\n")[7:] if p.startswith(' ')]
    paras = [tn.remove_symbols(p) for p in paras]
    paras = [tn.remove_extra_spaces(p) for p in paras]
    return paras

# turn a doc into clean clause-level lines (split on newlines and full-width commas/full stops)
def clean_doc_lines(doc):
    # keep content lines only
    paras = [p for p in re.split("[\n,。]", doc)[7:] if p.startswith(' ')]
    paras = [tn.remove_symbols(p) for p in paras]
    paras = [tn.remove_extra_spaces(p) for p in paras]
    return paras
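text_normalizer_zh is a local helper module whose source is not shown in this notebook. A minimal sketch of what the two helpers used above might look like, assuming simple regex-based cleaning (the actual implementation may differ):
import re

def remove_symbols(text):
    # drop punctuation and other non-word symbols, keeping CJK characters and whitespace
    return re.sub(r'[^\w\s]', '', text)

def remove_extra_spaces(text):
    # remove all whitespace (including full-width spaces), as in the cleaned output below
    return re.sub(r'\s+', '', text)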
# save cleaned lines to file, one per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
# load document
in_filename = "../../../RepositoryData/data/jingyong-part-cht-utf8.txt"
doc=load_doc(in_filename)
print(doc[:200])
『金庸作品集/作者:金庸』
『狀態:已完結』
『內容簡介:
金庸的所有的書</p>
』
愛下電子書Txt版閱讀,下載和分享更多電子書請訪問:http://www.ixdzs.com,手機訪問:http://m.ixdzs.com,E-mail:support@ixdzs.com
------章節內容開始-------
天龍八部
第一章 青衫磊落險峰行
青光閃動,一柄青鋼
# clean document
paras = clean_doc_lines(doc)
print(paras[:20])
print('Total Paragraphs: %d' % len(paras))
print('Unique Tokens: %d' % len(set(''.join(paras))))
['青光閃動', '兩人劍法迅捷', '練武廳東坐著二人', '眼見那少年與中年漢子已拆到七十余招', '便在這時', '那長須老者滿臉得色', '這老者姓左', '無量劍原分東北西三宗', '西首錦凳上所坐的則是別派人士', '當下左子穆笑道辛師妹今年派出的四名弟子', '馬五德臉上微微一紅', '左子穆心想他若是你弟子', '那姓段青年微笑道在下單名一譽字', '馬五德和段譽也是初交', '左子穆道段兄既然不是馬五哥的好朋友', '那中年漢子龔光杰巴不得師父有這句話', '段譽輕揮折扇', '他這番說什麼你師父我師父的', '龔光杰大踏步過來', '段譽道你這位大爺怎地如此狠霸霸的我平生最不愛瞧人打架']
Total Paragraphs: 34031
Unique Tokens: 3388
Line-based Language Model#
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, plot_model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
Text to Sequences#
# prepare data
data = '\n'.join(paras) # collapse the entire corpus into one string
# prepare the tokenizer on the source text
tokenizer = Tokenizer(
    oov_token=1, char_level=True
)  ## designate a token for unknown characters + character-level tokenization
tokenizer.fit_on_texts([data])
# determine the vocabulary size
## index 0 is reserved by Keras for padding, hence the +1
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# create paragraph-based sequences
sequences = list()
for line in data.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    ## For each line, after converting characters into indexes,
    ## prepare the training sequences.
    ## Given a line w1,w2,w3,w4, create the input sequences:
    ## w1,w2
    ## w1,w2,w3
    ## w1,w2,w3,w4
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
Vocabulary Size: 3391
Total Sequences: 222346
Word ID to Texts#
# Create a reverse dictionary mapping indexes back to characters
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
# Function takes a list of indexes and returns the corresponding characters
def sequence_to_text(list_of_indices):
    # Look up each index in the reverse dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return words
sequences[:10]
[[256, 105],
[256, 105, 506],
[256, 105, 506, 195],
[82, 8],
[82, 8, 34],
[82, 8, 34, 90],
[82, 8, 34, 90, 913],
[82, 8, 34, 90, 913, 1650],
[437, 129],
[437, 129, 796]]
[print(sequence_to_text(s)) for s in sequences[:10]]
['青', '光']
['青', '光', '閃']
['青', '光', '閃', '動']
['兩', '人']
['兩', '人', '劍']
['兩', '人', '劍', '法']
['兩', '人', '劍', '法', '迅']
['兩', '人', '劍', '法', '迅', '捷']
['練', '武']
['練', '武', '廳']
[None, None, None, None, None, None, None, None, None, None]
Padding#
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
Max Sequence Length: 133
Train and Test Sets#
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
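As an aside, to_categorical() materializes a dense one-hot matrix of shape (num_sequences, vocab_size) here. A minimal alternative sketch (not used below) keeps the targets as integer ids and pairs them with a sparse loss when compiling the model:
# keep integer targets instead of one-hot vectors (sketch; not used below)
y_sparse = sequences[:, -1]
# the model would then be compiled with a sparse loss, e.g.:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])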
Define Model#
# define model
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_length-1))
model.add(LSTM(100, return_sequences=True)) # LSTM 1
model.add(LSTM(100)) # LSTM 2
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 132, 128) 434048
_________________________________________________________________
lstm (LSTM) (None, 132, 100) 91600
_________________________________________________________________
lstm_1 (LSTM) (None, 100) 80400
_________________________________________________________________
dense (Dense) (None, 100) 10100
_________________________________________________________________
dense_1 (Dense) (None, 3391) 342491
=================================================================
Total params: 958,639
Trainable params: 958,639
Non-trainable params: 0
_________________________________________________________________
None
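plot_model is imported above but not used. A minimal sketch of saving an architecture diagram with it, assuming pydot and graphviz are installed (the output file name is arbitrary):
# save a diagram of the network architecture (requires pydot + graphviz)
plot_model(model, to_file='jing-yong-line-lm-model.png', show_shapes=True)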
Train Model#
# # fit network
# model.fit(X, y, batch_size= 256, epochs=500, verbose=1)
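If the model were trained here rather than loaded from disk below, a sketch of the fit call with a checkpoint callback (so a long run keeps saving the best weights; the file name mirrors the one loaded later) might look like:
# from keras.callbacks import ModelCheckpoint
# checkpoint = ModelCheckpoint('jing-yong-line-lm-model.h5', monitor='loss',
#                              save_best_only=True, verbose=1)
# model.fit(X, y, batch_size=256, epochs=500, verbose=1, callbacks=[checkpoint])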
Save Model#
# from pickle import dump
# # save the model to file
# model.save('jing-yong-line-lm-model.h5')
# # save the tokenizer
# dump(tokenizer, open('jing-yong-line-lm-tokenizer.pkl', 'wb'))
Load Model#
import pickle
model.load_weights('jing-yong-line-lm-model.h5')
pickle_in = open('jing-yong-line-lm-tokenizer.pkl', "rb")
tokenizer = pickle.load(pickle_in)
Generate Sequence#
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict the index of the next character
        yhat = model.predict_classes(encoded, verbose=0)
        # map the predicted index back to a character
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
    return in_text
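Note that Sequential.predict_classes() was removed in newer TensorFlow/Keras releases (2.6+). A minimal sketch of the equivalent prediction step using predict() plus argmax and the reverse_word_map built earlier (predict_next_char is a hypothetical helper name):
from numpy import argmax

def predict_next_char(model, tokenizer, max_length, in_text):
    # encode and pad the current text, then pick the most probable next character
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    probs = model.predict(encoded, verbose=0)   # shape: (1, vocab_size)
    yhat = int(argmax(probs, axis=-1)[0])       # index of the most likely character
    return reverse_word_map.get(yhat, '')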
# evaluate model
print(generate_seq(model, tokenizer, max_length-1, '師父', 100))
師父 師 姊 娘 我 段 正 結 交 視 那 儀 雖 尚 後 裡 當 年 子 師 父 段 譽 頭 子 引 兒 兒 滑 叛 得 不 過 身 子 道 你 你 什 了 身 子 李 延 慶 忽 袱 畔 起 了 我 呢 尚 穆 道 有 段 正 淳 嗎 朝 皇 此 也 羞 著 譚 婆 兒 子 我 沒 給 丐 姊 平 之 負 他 不 讓 徒 主 不 知 道 人 的 好 你 我 早 睛 兩 個 英 雄 母 殿 想 起