Natural Language Toolkit (NLTK)#

  • The almighty nltk package!

Install#

  • Install package in terminal

!pip install nltk
  • Download nltk data in python

import nltk
# Download every NLTK data package ('all'). With halt_on_error=False the
# downloader is asked to keep going when an individual package fails
# (see the NLTK downloader documentation). This is a large, one-time download.
nltk.download('all', halt_on_error=False)
import nltk
# The download call is left commented out here, presumably so that re-running
# the notebook does not trigger the download again; uncomment it on first use.
# nltk.download('all', halt_on_error=False)

Corpora Data#

  • The package includes a lot of pre-loaded corpora datasets

  • The default nltk_data directory is in /Users/YOUR_NAME/nltk_data/

  • Selective Examples

    • Brown Corpus

    • Reuters Corpus

    • WordNet

from nltk.corpus import gutenberg, brown, reuters

# Brown corpus
## Categories: the labels the corpus texts are grouped under
## (e.g. 'news', 'fiction', 'humor')
brown_categories = brown.categories()
print('Brown Corpus Total Categories: ', len(brown_categories))
print('Categories List: ', brown_categories)
Brown Corpus Total Categories:  15
Categories List:  ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
# Sentences (each sentence is a list of word tokens)
first_sentence = brown.sents()[0]
print(first_sentence)  ## first sentence of the whole corpus

## All sentences of the fiction texts; the printed form is abbreviated with '...'
fiction_sentences = brown.sents(categories='fiction')
print(fiction_sentences)
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.'], ...]
## Tagged sentences: every token is a (word, POS-tag) pair
first_tagged_sentence = brown.tagged_sents()[0]
print(first_tagged_sentence)
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
## Sentences in natural (space-joined) form
sents = brown.sents(categories='fiction')
## Join the tokens of the 2nd to 5th fiction sentences back into strings
list(map(' '.join, sents[1:5]))
['Scotty did not go back to school .',
 'His parents talked seriously and lengthily to their own doctor and to a specialist at the University Hospital -- Mr. McKinley was entitled to a discount for members of his family -- and it was decided it would be best for him to take the remainder of the term off , spend a lot of time in bed and , for the rest , do pretty much as he chose -- provided , of course , he chose to do nothing too exciting or too debilitating .',
 'His teacher and his school principal were conferred with and everyone agreed that , if he kept up with a certain amount of work at home , there was little danger of his losing a term .',
 'Scotty accepted the decision with indifference and did not enter the arguments .']
## Tagged words of the fiction category; each element is a (word, tag) tuple
tagged_words = brown.tagged_words(categories='fiction')

## Collect all nouns: keep every pair whose tag contains 'NP' or 'NN'.
## This is a substring match, so variants such as 'NNS' or 'NN-TL' are kept too.
nouns = [
    (token, pos)
    for token, pos in tagged_words
    if 'NP' in pos or 'NN' in pos
]

## Peek at the first ten nouns
nouns[:10]
[('Scotty', 'NP'),
 ('school', 'NN'),
 ('parents', 'NNS'),
 ('doctor', 'NN'),
 ('specialist', 'NN'),
 ('University', 'NN-TL'),
 ('Hospital', 'NN-TL'),
 ('Mr.', 'NP'),
 ('McKinley', 'NP'),
 ('discount', 'NN')]
## Build a frequency distribution over the noun word forms (tags are dropped)
nouns_freq = nltk.FreqDist(token for token, _pos in nouns)

## Twenty most frequent nouns: sort the (word, count) items by count, descending
items_by_count = sorted(
    nouns_freq.items(),
    key=lambda item: item[1],
    reverse=True,
)
items_by_count[:20]
[('man', 111),
 ('time', 99),
 ('men', 72),
 ('room', 63),
 ('way', 62),
 ('eyes', 60),
 ('face', 55),
 ('house', 54),
 ('head', 54),
 ('night', 53),
 ('day', 52),
 ('hand', 50),
 ('door', 47),
 ('life', 44),
 ('years', 44),
 ('Mrs.', 41),
 ('God', 41),
 ('Kate', 40),
 ('Mr.', 39),
 ('people', 39)]
## Same (word, count) items, now ordered by the word itself in reverse
## alphabetical order; show the first twenty
sorted(nouns_freq.items(),key=lambda x:x[0], reverse=True)[:20]
[('zoo', 2),
 ('zlotys', 1),
 ('zenith', 1),
 ('youth', 5),
 ('yelling', 1),
 ('years', 44),
 ('yearning', 1),
 ("year's", 1),
 ('year', 9),
 ('yards', 4),
 ('yard', 7),
 ('yachts', 1),
 ('writing', 2),
 ('writers', 1),
 ('writer', 4),
 ('wrists', 1),
 ('wrist', 2),
 ('wrinkles', 1),
 ('wrinkle', 1),
 ('wretch', 1)]
## Built-in shortcut: the ten most frequent nouns as (word, count) pairs
nouns_freq.most_common(10)
[('man', 111),
 ('time', 99),
 ('men', 72),
 ('room', 63),
 ('way', 62),
 ('eyes', 60),
 ('face', 55),
 ('house', 54),
 ('head', 54),
 ('night', 53)]
## Access data via fileid
## NOTE: only the last expression of a notebook cell is displayed, so the
## file ID looked up on the next line is not shown in the output.
brown.fileids(categories='fiction')[0]
## Sentences of a single corpus file, selected by its file ID
brown.sents(fileids='ck01')
[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.'], ...]

WordNet#

  • A dictionary resource

from nltk.corpus import wordnet as wn
# the word to look up in WordNet
word = 'walk'

# get synsets: every WordNet synset (sense) that `word` belongs to
word_synsets = wn.synsets(word)
# display the list of synsets
word_synsets
[Synset('walk.n.01'),
 Synset('base_on_balls.n.01'),
 Synset('walk.n.03'),
 Synset('walk.n.04'),
 Synset('walk.n.05'),
 Synset('walk.n.06'),
 Synset('walk_of_life.n.01'),
 Synset('walk.v.01'),
 Synset('walk.v.02'),
 Synset('walk.v.03'),
 Synset('walk.v.04'),
 Synset('walk.v.05'),
 Synset('walk.v.06'),
 Synset('walk.v.07'),
 Synset('walk.v.08'),
 Synset('walk.v.09'),
 Synset('walk.v.10')]
## Print ID, part of speech, definition and examples for each 'walk.v.*' synset
report_template = (
    'Syset ID: %s \n'
    'POS Tag: %s \n'
    'Definition: %s \n'
    'Examples: %s \n'
)
for s in word_synsets:
    # Skip every synset whose name does not start with 'walk.v'
    if not str(s.name()).startswith('walk.v'):
        continue
    details = (s.name(), s.pos(), s.definition(), s.examples())
    print(report_template % details)
Syset ID: walk.v.01 
POS Tag: v 
Definition: use one's feet to advance; advance by steps 
Examples: ["Walk, don't run!", 'We walked instead of driving', 'She walks with a slight limp', 'The patient cannot walk yet', 'Walk over to the cabinet'] 

Syset ID: walk.v.02 
POS Tag: v 
Definition: accompany or escort 
Examples: ["I'll walk you to your car"] 

Syset ID: walk.v.03 
POS Tag: v 
Definition: obtain a base on balls 
Examples: [] 

Syset ID: walk.v.04 
POS Tag: v 
Definition: traverse or cover by walking 
Examples: ['Walk the tightrope', 'Paul walked the streets of Damascus', 'She walks 3 miles every day'] 

Syset ID: walk.v.05 
POS Tag: v 
Definition: give a base on balls to 
Examples: [] 

Syset ID: walk.v.06 
POS Tag: v 
Definition: live or behave in a specified manner 
Examples: ['walk in sadness'] 

Syset ID: walk.v.07 
POS Tag: v 
Definition: be or act in association with 
Examples: ['We must walk with our dispossessed brothers and sisters', 'Walk with God'] 

Syset ID: walk.v.08 
POS Tag: v 
Definition: walk at a pace 
Examples: ['The horses walked across the meadow'] 

Syset ID: walk.v.09 
POS Tag: v 
Definition: make walk 
Examples: ['He walks the horse up the mountain', 'Walk the dog twice a day'] 

Syset ID: walk.v.10 
POS Tag: v 
Definition: take a walk; go for a walk; walk for pleasure 
Examples: ['The lovers held hands while walking', 'We like to walk every Sunday']