Natural Language Toolkit (NLTK)#
The almighty
nltk
package!
Install#
Install the package from a terminal (the leading `!` below runs the command from a notebook cell; omit it in a plain shell)
!pip install nltk
Download nltk data in python
import nltk
# Request the 'all' download identifier (this can be a large download).
# halt_on_error=False is passed so that a failure on one item is presumably
# not meant to abort the rest -- confirm against the nltk.download docs.
nltk.download('all', halt_on_error=False)
import nltk
# nltk.download('all', halt_on_error=False)
Corpora Data#
The package includes a lot of pre-loaded corpora datasets
The default
nltk_data
directory is in /Users/YOUR_NAME/nltk_data/
Selected Examples
Brown Corpus
Reuters Corpus
WordNet
from nltk.corpus import gutenberg, brown, reuters
# Brown corpus
## Fetch the category labels once, then report how many there are and list them
brown_categories = brown.categories()
print('Brown Corpus Total Categories: ', len(brown_categories))
print('Categories List: ', brown_categories)
Brown Corpus Total Categories: 15
Categories List: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
# Sentences (each sentence is a list of token strings)
print(brown.sents()[0]) ## first sentence of the whole corpus
print(brown.sents(categories='fiction')) ## all sentences of the 'fiction' category (not just the first); the printed view is truncated
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.'], ...]
## Tagged sentences: every token comes as a (word, tag) pair
first_tagged_sent = brown.tagged_sents()[0]
print(first_tagged_sent)
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]
## Sentences as plain strings: join each sentence's tokens with single spaces
sents = brown.sents(categories='fiction')
list(map(' '.join, sents[1:5]))
['Scotty did not go back to school .',
'His parents talked seriously and lengthily to their own doctor and to a specialist at the University Hospital -- Mr. McKinley was entitled to a discount for members of his family -- and it was decided it would be best for him to take the remainder of the term off , spend a lot of time in bed and , for the rest , do pretty much as he chose -- provided , of course , he chose to do nothing too exciting or too debilitating .',
'His teacher and his school principal were conferred with and everyone agreed that , if he kept up with a certain amount of work at home , there was little danger of his losing a term .',
'Scotty accepted the decision with indifference and did not enter the arguments .']
## Get tagged words for the fiction category (each item is a (word, tag) tuple)
tagged_words = brown.tagged_words(categories='fiction')
## Collect every pair whose tag contains 'NP' or 'NN'
nouns = []
for token, pos in tagged_words:
    if 'NP' in pos or 'NN' in pos:
        nouns.append((token, pos))
## Check first ten nouns
nouns[:10]
[('Scotty', 'NP'),
('school', 'NN'),
('parents', 'NNS'),
('doctor', 'NN'),
('specialist', 'NN'),
('University', 'NN-TL'),
('Hospital', 'NN-TL'),
('Mr.', 'NP'),
('McKinley', 'NP'),
('discount', 'NN')]
## Build a frequency distribution over the noun word forms (tags are dropped)
nouns_freq = nltk.FreqDist(word_form for word_form, _tag in nouns)
## Top twenty entries, ordered by count from highest to lowest
ranked_by_count = sorted(nouns_freq.items(), key=lambda entry: entry[1], reverse=True)
ranked_by_count[:20]
[('man', 111),
('time', 99),
('men', 72),
('room', 63),
('way', 62),
('eyes', 60),
('face', 55),
('house', 54),
('head', 54),
('night', 53),
('day', 52),
('hand', 50),
('door', 47),
('life', 44),
('years', 44),
('Mrs.', 41),
('God', 41),
('Kate', 40),
('Mr.', 39),
('people', 39)]
## Same (word, count) entries, ordered by the word itself in descending string order
ranked_by_word = sorted(nouns_freq.items(), key=lambda entry: entry[0], reverse=True)
ranked_by_word[:20]
[('zoo', 2),
('zlotys', 1),
('zenith', 1),
('youth', 5),
('yelling', 1),
('years', 44),
('yearning', 1),
("year's", 1),
('year', 9),
('yards', 4),
('yard', 7),
('yachts', 1),
('writing', 2),
('writers', 1),
('writer', 4),
('wrists', 1),
('wrist', 2),
('wrinkles', 1),
('wrinkle', 1),
('wretch', 1)]
## Ten most frequent nouns, straight from the frequency distribution's most_common()
nouns_freq.most_common(10)
[('man', 111),
('time', 99),
('men', 72),
('room', 63),
('way', 62),
('eyes', 60),
('face', 55),
('house', 54),
('head', 54),
('night', 53)]
## Access data via fileid
## Look up the id of the first fiction file and fetch that file's sentences.
## (Using the looked-up id instead of a hard-coded one keeps the two steps
## connected; the lookup result was previously discarded.)
first_fiction_id = brown.fileids(categories='fiction')[0]
brown.sents(fileids=first_fiction_id)
[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.'], ...]
WordNet#
A dictionary resource
from nltk.corpus import wordnet as wn
# Word to look up in WordNet
word = 'walk'
# Get the synsets WordNet returns for the word
word_synsets = wn.synsets(word)
# Bare expression: displayed as the cell result in a notebook
word_synsets
[Synset('walk.n.01'),
Synset('base_on_balls.n.01'),
Synset('walk.n.03'),
Synset('walk.n.04'),
Synset('walk.n.05'),
Synset('walk.n.06'),
Synset('walk_of_life.n.01'),
Synset('walk.v.01'),
Synset('walk.v.02'),
Synset('walk.v.03'),
Synset('walk.v.04'),
Synset('walk.v.05'),
Synset('walk.v.06'),
Synset('walk.v.07'),
Synset('walk.v.08'),
Synset('walk.v.09'),
Synset('walk.v.10')]
## Print the details of each synset whose name starts with 'walk.v'
verb_synsets = [syn for syn in word_synsets if str(syn.name()).startswith('walk.v')]
## One %-template per synset: id, POS tag, definition and example sentences
detail_template = (
    'Syset ID: %s \n'
    'POS Tag: %s \n'
    'Definition: %s \n'
    'Examples: %s \n'
)
for syn in verb_synsets:
    details = (syn.name(), syn.pos(), syn.definition(), syn.examples())
    print(detail_template % details)
Syset ID: walk.v.01
POS Tag: v
Definition: use one's feet to advance; advance by steps
Examples: ["Walk, don't run!", 'We walked instead of driving', 'She walks with a slight limp', 'The patient cannot walk yet', 'Walk over to the cabinet']
Syset ID: walk.v.02
POS Tag: v
Definition: accompany or escort
Examples: ["I'll walk you to your car"]
Syset ID: walk.v.03
POS Tag: v
Definition: obtain a base on balls
Examples: []
Syset ID: walk.v.04
POS Tag: v
Definition: traverse or cover by walking
Examples: ['Walk the tightrope', 'Paul walked the streets of Damascus', 'She walks 3 miles every day']
Syset ID: walk.v.05
POS Tag: v
Definition: give a base on balls to
Examples: []
Syset ID: walk.v.06
POS Tag: v
Definition: live or behave in a specified manner
Examples: ['walk in sadness']
Syset ID: walk.v.07
POS Tag: v
Definition: be or act in association with
Examples: ['We must walk with our dispossessed brothers and sisters', 'Walk with God']
Syset ID: walk.v.08
POS Tag: v
Definition: walk at a pace
Examples: ['The horses walked across the meadow']
Syset ID: walk.v.09
POS Tag: v
Definition: make walk
Examples: ['He walks the horse up the mountain', 'Walk the dog twice a day']
Syset ID: walk.v.10
POS Tag: v
Definition: take a walk; go for a walk; walk for pleasure
Examples: ['The lovers held hands while walking', 'We like to walk every Sunday']