Doc2Vec#

  • An extension of Word2Vec

  • Convert a document into a fixed-size numeric vector representation

TaggedDocument Preparation#

import os, gensim
# Lee corpus files, located inside the test data directory bundled with gensim.
# os.path.join builds the paths portably instead of hand-joining with os.sep.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')  # used for training
lee_test_file = os.path.join(test_data_dir, 'lee.cor')  # used as test documents

# Print the resolved locations so a wrong or missing directory is easy to spot.
for corpus_path in (test_data_dir, lee_train_file, lee_test_file):
    print(corpus_path)
/Users/Alvin/opt/anaconda3/envs/python-notes/lib/python3.7/site-packages/gensim/test/test_data
/Users/Alvin/opt/anaconda3/envs/python-notes/lib/python3.7/site-packages/gensim/test/test_data/lee_background.cor
/Users/Alvin/opt/anaconda3/envs/python-notes/lib/python3.7/site-packages/gensim/test/test_data/lee.cor
import smart_open

def read_corpus(file_name, tokens_only=False):
    """Yield one pre-processed document per line of ``file_name``.

    Args:
        file_name: Path of a text file holding one document per line.
        tokens_only: When true, yield plain token lists (no tags). Otherwise
            yield ``TaggedDocument`` objects tagged with the zero-based line
            number.

    Yields:
        A list of tokens, or a ``TaggedDocument``, depending on
        ``tokens_only``.
    """
    # smart_open.smart_open() is deprecated in favour of smart_open.open().
    # Binary mode keeps the old function's default, so simple_preprocess
    # still receives bytes and performs the decoding itself.
    with smart_open.open(file_name, 'rb') as f:
        for i, line in enumerate(f):
            # Tokenise once; both output flavours share the same tokens.
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

# Materialise the generators as lists: the training corpus is iterated more than
# once (vocabulary building, then training), which a one-shot generator cannot do.
train_corpus = list(read_corpus(lee_train_file))
# Test documents are kept as plain token lists (no tags).
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
/Users/Alvin/opt/anaconda3/envs/python-notes/lib/python3.7/site-packages/smart_open/smart_open_lib.py:252: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL

TaggedDocument Format#

  • TaggedDocument(words = List(token, token, …), tags = List(int))

# Inspect a single training example (the document at index 2).
train_corpus[2]

## A TaggedDocument(list of word tokens, list of integer tags)
TaggedDocument(words=['the', 'national', 'road', 'toll', 'for', 'the', 'christmas', 'new', 'year', 'holiday', 'period', 'stands', 'at', 'eight', 'fewer', 'than', 'for', 'the', 'same', 'time', 'last', 'year', 'people', 'have', 'died', 'on', 'new', 'south', 'wales', 'roads', 'with', 'eight', 'fatalities', 'in', 'both', 'queensland', 'and', 'victoria', 'western', 'australia', 'the', 'northern', 'territory', 'and', 'south', 'australia', 'have', 'each', 'recorded', 'three', 'deaths', 'while', 'the', 'act', 'and', 'tasmania', 'remain', 'fatality', 'free'], tags=[2])

Model Training#

%%time
from gensim.models import Doc2Vec
# Single Doc2Vec model: 50-dimensional vectors, min_count=2, 100 training epochs.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=100)
# The vocabulary must be built before training can start.
model.build_vocab(train_corpus)
# `model.iter` is deprecated (to be removed in gensim 4.0.0); `model.epochs`
# is the supported attribute and holds the value passed to the constructor.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Hyper-parameters common to both architectures.
shared_params = dict(vector_size=200, window=8, min_count=10, epochs=50)

# PV-DBOW (Skip-Gram equivalent of Word2Vec)
pv_dbow = Doc2Vec(dm=0, dbow_words=1, **shared_params)

# PV-DM w/average (CBOW equivalent of Word2Vec)
pv_dm = Doc2Vec(dm=1, dm_mean=1, **shared_params)

# Order matters to later cells: index 0 is PV-DBOW, index 1 is PV-DM.
models = [pv_dbow, pv_dm]
/Users/Alvin/.local/lib/python3.7/site-packages/ipykernel_launcher.py:4: DeprecationWarning: Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).
  after removing the cwd from sys.path.
CPU times: user 10.6 s, sys: 714 ms, total: 11.4 s
Wall time: 6.32 s

Concatenated Model#

## Train both PV-DBOW and PV-DM and combine the two

documents = train_corpus
# Build the vocabulary once on the PV-DBOW model, then initialise the PV-DM
# model from it via reset_from() instead of scanning the corpus a second time.
models[0].build_vocab(documents)
models[1].reset_from(models[0])

# Use a dedicated loop variable so the `model` trained earlier is not rebound
# as a side effect of this loop.
for sub_model in models:
    sub_model.train(documents, total_examples=sub_model.corpus_count, epochs=sub_model.epochs)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec((models[0], models[1]))

# Query one sub-model explicitly. The original code used the leaked loop
# variable here, which happened to be the last entry of `models` (PV-DM).
# NOTE(review): most_similar() does not appear to be usable on the
# concatenated model directly -- confirm against the gensim version in use.
pv_dm_model = models[1]
inferred_vector = pv_dm_model.infer_vector(train_corpus[0].words)
sims = pv_dm_model.docvecs.most_similar([inferred_vector])
print(sims)
[(0, 0.9365277290344238), (48, 0.8249694108963013), (255, 0.8241095542907715), (40, 0.7847284078598022), (272, 0.7842742204666138), (8, 0.7567874193191528), (264, 0.7083355188369751), (33, 0.7043914198875427), (19, 0.6420626640319824), (10, 0.6348395943641663)]

Note

A thread on how to use most_similar() with ConcatenatedDoc2Vec: link

# Run the same query (the first training document) against each sub-model of
# the concatenated model in turn: model 1 first, then model 2.
query_words = train_corpus[0].words
per_model_sims = []
for sub_model in new_model.models:
    inferred_vector = sub_model.infer_vector(query_words)
    neighbours = sub_model.docvecs.most_similar([inferred_vector])
    print(neighbours)
    per_model_sims.append(neighbours)

# Keep the original result names: sims2 -> model 1, sims3 -> model 2.
sims2, sims3 = per_model_sims
[(0, 0.9334837198257446), (33, 0.4757263660430908), (40, 0.44081586599349976), (48, 0.43202292919158936), (189, 0.3894966244697571), (264, 0.36887627840042114), (46, 0.36355406045913696), (8, 0.36284035444259644), (53, 0.3596709966659546), (255, 0.346430242061615)]
[(0, 0.9470268487930298), (48, 0.8333772420883179), (255, 0.8190398216247559), (40, 0.7976764440536499), (272, 0.7799839973449707), (8, 0.7639546394348145), (264, 0.7157275676727295), (33, 0.7005429267883301), (19, 0.6571251153945923), (10, 0.6412277817726135)]
## The document at index 0 seems most similar to the document at index 255?
# Print the query document followed by two of its reported neighbours,
# each as space-joined tokens with a blank line after it.
for doc_id in (0, 255, 33):
    print(' '.join(train_corpus[doc_id].words) + '\n')
hundreds of people have been forced to vacate their homes in the southern highlands of new south wales as strong winds today pushed huge bushfire towards the town of hill top new blaze near goulburn south west of sydney has forced the closure of the hume highway at about pm aedt marked deterioration in the weather as storm cell moved east across the blue mountains forced authorities to make decision to evacuate people from homes in outlying streets at hill top in the new south wales southern highlands an estimated residents have left their homes for nearby mittagong the new south wales rural fire service says the weather conditions which caused the fire to burn in finger formation have now eased and about fire units in and around hill top are optimistic of defending all properties as more than blazes burn on new year eve in new south wales fire crews have been called to new fire at gunning south of goulburn while few details are available at this stage fire authorities says it has closed the hume highway in both directions meanwhile new fire in sydney west is no longer threatening properties in the cranebrook area rain has fallen in some parts of the illawarra sydney the hunter valley and the north coast but the bureau of meteorology claire richards says the rain has done little to ease any of the hundred fires still burning across the state the falls have been quite isolated in those areas and generally the falls have been less than about five millimetres she said in some places really not significant at all less than millimetre so there hasn been much relief as far as rain is concerned in fact they ve probably hampered the efforts of the firefighters more because of the wind gusts that are associated with those thunderstorms

the new south wales state emergency service ses says it has now received calls for help in the wake of monday fierce storms natural disaster areas have been declared throughout sydney and surrounding areas and parts of the state north west in sydney more than homes mainly in the northern suburbs remain without power ses spokeswoman laura goodin says several hundred volunteers will be back in the field this morning we ve had about calls for help of which we ve completed about two thirds we ve had about volunteers in the field being helped out by the royal fire service and the new south wales fire brigades and we re expecting to have most jobs completed by about friday ms goodin said the extensive storm damage has prompted warning about people falsely claiming to work for the ses the warning from fair trading minister john aquilina follows reports from the suburb of hornsby that people claiming to work for the ses are asking for payment from the storm victims mr aquilina has reminded householders that the ses is volunteer organisation and does not charge for its work or employ sub contractors he has suggested residents contact the police if they are approached by such people the government is also warning householders against dealing with unlicensed tradespeople

new south wales firefighters are hoping lighter winds will help ease their workload today but are predicting nasty conditions over the weekend while the winds are expected to ease somewhat today the weather bureau says temperatures will be higher more than fires are still burning across new south wales the rural fire service says the change may allow it to concentrate more on preventative action but there is no room for complacency mark sullivan from the rural fire service says while conditions may be little kinder to them today the outlook for the weekend has them worried it certainly appears from the weather forecast with very high temperatures and high winds that it certainly could be nasty couple of days ahead mr sullivan said one of the areas causing greatest concern today is the kilometre long blaze in the lower blue mountains firefighters are also keeping close eye on blaze at spencer north of sydney which yesterday broke through containment lines there are concerns that fire may jump the hawkesbury river backburning continues in the state central west and south of sydney in the shoalhaven in the illawarra firefighters have been able to carry out back burning operations in three areas operations were carried out in parts of mt kembla as well as an area bounded by appin road and the old princes highway at helensburgh an area west of windy gully near cataract dam was also targeted meanwhile illawarra police have arrested three teenagers in relation to bushfires at shellharbour on the south coast of new south wales spokesman says three small fires were extinguished around pm aedt yesterday short time later police arrested three year old boys from shellharbour barrack heights and shell cove all three have been interviewed but no charges have been laid
## Other vector models 

# # glove

# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_input_file = 'glove.6B.100d.txt'
# word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)

# from gensim.models import KeyedVectors
# filename = 'glove.6B.100d.txt.word2vec'
# model = KeyedVectors.load_word2vec_format(filename, binary=False)

# model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

# from gensim.models.fasttext import FastText

# ft_model = FastText(size=100)
# ft_model.build_vocab(data)
# ft_model.train(data, total_examples=ft_model.corpus_count, epochs=ft_model.iter)


# from gensim.models.wrappers.fasttext import FastText

# # Set FastText home to the path to the FastText executable
# ft_home = '/home/bhargav/Gensim/fastText/fasttext'
# # train the model
# model_wrapper = FastText.train(ft_home, train_file)

# print('dog' in model.wv.vocab)
# print('dogs' in model.wv.vocab)

# print('dog' in model)
# print('dogs' in model)

# from gensim.models.wrappers import Wordrank

# wordrank_path = 'wordrank' # path to Wordrank directory
# out_dir = 'model' # name of output directory to save data to
# data = '../../gensim/test/test_data/lee.cor' # sample corpus

# model = Wordrank.train(wordrank_path, data, out_dir, iter=21, dump_period=10)


# varembed_vectors = '../../gensim/test/test_data/varembed_leecorpus_vectors.pkl'
# model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors)


# morfessors = '../../gensim/test/test_data/varembed_leecorpus_morfessor.bin'
# model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors, morfessor_model=morfessors)

# import os

# poincare_directory = os.path.join(os.getcwd(), 'docs', 'notebooks', 'poincare')
# data_directory = os.path.join(poincare_directory, 'data')
# wordnet_mammal_file = os.path.join(data_directory, 'wordnet_mammal_hypernyms.tsv')

# from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations
# relations = PoincareRelations(file_path=wordnet_mammal_file, delimiter='\t')
# model = PoincareModel(train_data=relations, size=2, burn_in=0)
# model.train(epochs=1, print_every=500)

# models_directory = os.path.join(poincare_directory, 'models')
# test_model_path = os.path.join(models_directory, 'gensim_model_batch_size_10_burn_in_0_epochs_50_neg_20_dim_50')
# model = PoincareModel.load(test_model_path)