Corpus Linguistics Methods#
With nltk, we can easily implement quite a few corpus-linguistic methods:
Concordance Analysis (Simple Word Search)
Frequency Lists
Collocations
Data Analysis with R
Concordance Analysis (Patterns, Constructions?)
Patterns on sentence strings
Patterns on sentence word-tag strings
Preparing Corpus Data#
import nltk
from nltk.corpus import brown
from nltk.text import Text
import pandas as pd
import numpy as np
brown_text = Text(brown.words())
Collocations#
Documentation: nltk.collocations
nltk.collocations: Get the BigramCollocationFinder, which we can use to find n-grams.
nltk.metrics: Get the BigramAssocMeasures to define collocations (it is also available in nltk.collocations).
Use the finder.nbest() methods to select/filter collocations.
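As a minimal sketch of this workflow, here is a toy example on a small, made-up token list before we turn to the Brown corpus (toy_tokens, toy_measures, and toy_finder are illustrative names):
## Toy example of the finder/measures workflow (hypothetical mini-corpus)
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
toy_tokens = "New York is a big city and New York is busy".split()
toy_measures = BigramAssocMeasures()
toy_finder = BigramCollocationFinder.from_words(toy_tokens)
toy_finder.nbest(toy_measures.pmi, 3)  # top three bigrams by PMI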
## Collocations based on Text
brown_text.collocation_list()[:10]
#brown_text.collocations()
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = nltk.collocations.BigramAssocMeasures() # measures
finder = BigramCollocationFinder.from_words(brown.words()) # finders
## bigram collocations based on different association measures
finder.nbest(bigram_measures.likelihood_ratio,10)
finder.nbest(bigram_measures.pmi, 10)
[('$10,000-per-year', 'French-born'),
('$79.89', 'nothing-down'),
('$8.50', 'tab'),
("'low", 'nigras'),
('0.5-mv./m.', '50-percent'),
('0.78', 'mEq'),
('1,100', 'circumscriptions'),
('1,257,700', 'non-farm'),
('11-inch', 'headroom'),
('11-shot', 'hammerless')]
Apply Filters#
We can create an anonymous function as a helper to remove irrelevant word tokens before computing collocations.
For example, we remove:
word tokens whose character length is less than 3
word tokens that belong to the stopwords
word tokens that include at least one non-alphabetic character
## Apply freq-based filters for bigram collocations
finder.apply_freq_filter(10)
## Apply word filter function
from nltk.corpus import stopwords
stop_words_en = stopwords.words('english')
filter_stops = lambda w: len(w)<3 or w in stop_words_en or not w.isalpha()
finder.apply_word_filter(filter_stops) # filter on word tokens
finder.apply_freq_filter(10) # filter on bigram min frequencies
finder.nbest(bigram_measures.likelihood_ratio, 10)
finder.nbest(bigram_measures.pmi, 10)
[('Hong', 'Kong'),
('Viet', 'Nam'),
('Pathet', 'Lao'),
('Simms', 'Purdew'),
('Internal', 'Revenue'),
('Puerto', 'Rico'),
('Saxon', 'Shore'),
('carbon', 'tetrachloride'),
('unwed', 'mothers'),
('Armed', 'Forces')]
POS Collocations#
## Create collocations based on tags only
finder = BigramCollocationFinder.from_words(
    t for w, t in brown.tagged_words(tagset='universal') if t != 'X')
finder.nbest(bigram_measures.likelihood_ratio, 10)
[('ADP', 'DET'),
('DET', 'NOUN'),
('PRON', 'VERB'),
('ADJ', 'NOUN'),
('NOUN', '.'),
('NOUN', 'DET'),
('DET', 'ADJ'),
('NOUN', 'ADP'),
('PRT', 'VERB'),
('ADP', '.')]
Collocations based on Skipped Bigrams#
## Create collocations with intervening words (gapped n-grams)
## Note: window_size=2 considers only adjacent word pairs; use a window_size larger than 2 to allow intervening words
finder = BigramCollocationFinder.from_words(brown.words(), window_size=2)
finder.apply_word_filter(filter_stops)
finder.apply_freq_filter(10)
finder.nbest(bigram_measures.likelihood_ratio, 10)
[('United', 'States'),
('New', 'York'),
('per', 'cent'),
('Rhode', 'Island'),
('years', 'ago'),
('Los', 'Angeles'),
('White', 'House'),
('Peace', 'Corps'),
('World', 'War'),
('San', 'Francisco')]
Scoring Ngrams#
## Score the n-grams identified by the finder
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]
[(('United', 'States'), 0.0003375841376792124),
(('New', 'York'), 0.00025491047130879306),
(('per', 'cent'), 0.00012573286760501277),
(('years', 'ago'), 0.0001171210273580941),
(('The', 'first'), 8.267366637041936e-05),
(('Rhode', 'Island'), 7.750656222226816e-05),
(('could', 'see'), 7.492301014819255e-05),
(('last', 'year'), 5.856051367904705e-05),
(('first', 'time'), 5.769932965435518e-05),
(('White', 'House'), 5.5976961604971446e-05)]
scored = finder.above_score(bigram_measures.pmi, min_score = 15)
for s in scored:
    print(s)
('Hong', 'Kong')
('Viet', 'Nam')
('Pathet', 'Lao')
('Simms', 'Purdew')
('Internal', 'Revenue')
('Puerto', 'Rico')
('Saxon', 'Shore')
('carbon', 'tetrachloride')
('unwed', 'mothers')
Retrieve n-grams#
Note
The * operator unpacks the nested list and passes each element as a separate argument to the function call.
So we can use * to flatten a nested list (similar to unlist() in R).
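For instance, * spreads the inner lists of a nested list into separate arguments of zip():
nested = [[1, 2, 3], [4, 5, 6]]
print(list(zip(*nested)))  # same as zip([1, 2, 3], [4, 5, 6]) -> [(1, 4), (2, 5), (3, 6)]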
def compute_ngrams(sequence, n):
    return list(zip(*(sequence[index:] for index in range(n))))
print(compute_ngrams([1,2,3,4], 2))
print(compute_ngrams([1,2,3,4,5],3))
[(1, 2), (2, 3), (3, 4)]
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
Dispersion#
The dispersion of a linguistic unit is also important.
We need a metric that indicates how evenly the linguistic unit is distributed across the corpus.
Note
How can we get the document frequency of the bigrams?
unigram_freq = nltk.FreqDist(brown.words())
bigram_freq = nltk.FreqDist('_'.join(x) for x in nltk.bigrams(brown.words()))
# ngram freq list of each file in the corpus
unigram_freq_per_file = [nltk.FreqDist(words)
                         for words in [brown.words(fileids=f) for f in brown.fileids()]]
bigram_freq_per_file = [nltk.FreqDist('_'.join(x) for x in nltk.bigrams(words))
                        for words in [brown.words(fileids=f) for f in brown.fileids()]]
## Function to get unigram dispersion (document frequency across corpus files)
def createDipsersionDist(uni_freq, uni_freq_per_file):
    # count, for each unit, the number of files it occurs in
    unigram_dispersion = {}
    for fid in uni_freq_per_file:
        for w, f in fid.items():
            if w in unigram_dispersion:
                unigram_dispersion[w] += 1
            else:
                unigram_dispersion[w] = 1
    return unigram_dispersion
unigram_dispersion = createDipsersionDist(unigram_freq, unigram_freq_per_file)
# Dictionary cannot be sliced/subset
# Get the items() and convert to list for subsetting
list(unigram_dispersion.items())[:20]
[('The', 500),
('Fulton', 3),
('County', 45),
('Grand', 17),
('Jury', 4),
('said', 314),
('Friday', 34),
('an', 498),
('investigation', 34),
('of', 500),
("Atlanta's", 2),
('recent', 114),
('primary', 59),
('election', 28),
('produced', 66),
('``', 462),
('no', 455),
('evidence', 119),
("''", 463),
('that', 500)]
#dict(sorted(bigram_freq.items()[:3]))
list(bigram_freq.items())[:20]
[('The_Fulton', 1),
('Fulton_County', 6),
('County_Grand', 1),
('Grand_Jury', 2),
('Jury_said', 1),
('said_Friday', 4),
('Friday_an', 1),
('an_investigation', 7),
('investigation_of', 15),
("of_Atlanta's", 1),
("Atlanta's_recent", 1),
('recent_primary', 1),
('primary_election', 2),
('election_produced', 1),
('produced_``', 1),
('``_no', 6),
('no_evidence', 14),
("evidence_''", 3),
("''_that", 16),
('that_any', 31)]
bigram_dispersion = createDipsersionDist(bigram_freq, bigram_freq_per_file)
list(bigram_dispersion.items())[:20]
[('The_Fulton', 1),
('Fulton_County', 1),
('County_Grand', 1),
('Grand_Jury', 2),
('Jury_said', 1),
('said_Friday', 3),
('Friday_an', 1),
('an_investigation', 7),
('investigation_of', 14),
("of_Atlanta's", 1),
("Atlanta's_recent", 1),
('recent_primary', 1),
('primary_election', 2),
('election_produced', 1),
('produced_``', 1),
('``_no', 6),
('no_evidence', 12),
("evidence_''", 3),
("''_that", 16),
('that_any', 30)]
type(unigram_freq)
type(unigram_dispersion)
dict
Note
We can also implement the dispersion measure DP (deviation of proportions) proposed by Gries (2008).
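DP here is a dispersion measure, distinct from the delta P association measure introduced in the next section. Below is a minimal sketch that reuses unigram_freq_per_file from above and treats the Brown files as the corpus parts (gries_dp is a helper name introduced for illustration):
## A minimal sketch of Gries's (2008) DP (deviation of proportions)
## Assumption: the Brown files serve as the corpus parts
def gries_dp(word, freq_per_file, file_sizes):
    total_size = sum(file_sizes)
    total_freq = sum(fd[word] for fd in freq_per_file)
    if total_freq == 0:
        return None
    expected = [size / total_size for size in file_sizes]       # each part's share of the corpus
    observed = [fd[word] / total_freq for fd in freq_per_file]  # each part's share of the word's tokens
    return 0.5 * sum(abs(o - e) for o, e in zip(observed, expected))  # near 0 = even, near 1 = clumped

file_sizes = [len(brown.words(fileids=f)) for f in brown.fileids()]
print(gries_dp('the', unigram_freq_per_file, file_sizes))     # evenly dispersed, low DP
print(gries_dp('Fulton', unigram_freq_per_file, file_sizes))  # concentrated in a few files, high DP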
Delta P#
This is a directional association metric: delta P forward is P(w2|w1) - P(w2|~w1), and delta P backward is P(w1|w2) - P(w1|~w2).
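As a quick illustration of the two formulas with hypothetical counts (the cell names follow the contingency table used in the class below):
## Hypothetical 2x2 contingency counts: (w1,w2), (~w1,w2), (w1,~w2), (~w1,~w2)
n_ii, n_oi, n_io, n_oo = 8, 5, 2, 85
dp_fwd = n_ii/(n_ii + n_io) - n_oi/(n_oi + n_oo)  # P(w2|w1) - P(w2|~w1), ~0.744
dp_bwd = n_ii/(n_ii + n_oi) - n_io/(n_io + n_oo)  # P(w1|w2) - P(w1|~w2), ~0.592
print(dp_fwd, dp_bwd)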
## Inherit BigramAssocMeasures
class AugmentedBigramAssocMeasures(BigramAssocMeasures):

    @classmethod
    def raw_freq2(cls, *marginals):
        """Scores ngrams by their raw frequency"""
        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
        return n_ii

    @classmethod
    def dp_fwd(cls, *marginals):
        """Scores bigrams using delta P forward: P(w2|w1) - P(w2|~w1)

        This may be shown with respect to a contingency table::

                    w1    ~w1
                 ------ ------
             w2 | n_ii | n_oi | = n_xi
                 ------ ------
            ~w2 | n_io | n_oo |
                 ------ ------
                 = n_ix        TOTAL = n_xx
        """
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)
        return (n_ii / (n_ii + n_io)) - (n_oi / (n_oi + n_oo))

    @classmethod
    def dp_bwd(cls, *marginals):
        """Scores bigrams using delta P backward: P(w1|w2) - P(w1|~w2)

        This may be shown with respect to a contingency table::

                    w1    ~w1
                 ------ ------
             w2 | n_ii | n_oi | = n_xi
                 ------ ------
            ~w2 | n_io | n_oo |
                 ------ ------
                 = n_ix        TOTAL = n_xx
        """
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)
        return (n_ii / (n_ii + n_oi)) - (n_io / (n_io + n_oo))
bigram_measures = AugmentedBigramAssocMeasures()
finder = BigramCollocationFinder.from_words(brown.words())
#finder.apply_freq_filter(10)
bigrams_dpfwd = finder.score_ngrams(bigram_measures.dp_fwd)
bigrams_dpfwd[:10]
[(('$10,000-per-year', 'French-born'), 1.0),
(('$79.89', 'nothing-down'), 1.0),
(('$8.50', 'tab'), 1.0),
(("'low", 'nigras'), 1.0),
(('0.5-mv./m.', '50-percent'), 1.0),
(('0.78', 'mEq'), 1.0),
(('1,100', 'circumscriptions'), 1.0),
(('1,257,700', 'non-farm'), 1.0),
(('11-inch', 'headroom'), 1.0),
(('11-shot', 'hammerless'), 1.0)]
bigrams_dpbwd = finder.score_ngrams(bigram_measures.dp_bwd)
bigrams_dpbwd[:10]
[(('$10,000-per-year', 'French-born'), 1.0),
(('$79.89', 'nothing-down'), 1.0),
(('$8.50', 'tab'), 1.0),
(("'low", 'nigras'), 1.0),
(('0.5-mv./m.', '50-percent'), 1.0),
(('0.78', 'mEq'), 1.0),
(('1,100', 'circumscriptions'), 1.0),
(('1,257,700', 'non-farm'), 1.0),
(('11-inch', 'headroom'), 1.0),
(('11-shot', 'hammerless'), 1.0)]
Checking Computation Accuracy#
Check if delta P is correctly computed.
bigrams_rawfreq = finder.score_ngrams(bigram_measures.raw_freq2)
bigrams_rawfreq[:10]
[(('of', 'the'), 9625.0),
((',', 'and'), 6288.0),
(('.', 'The'), 6081.0),
(('in', 'the'), 5546.0),
((',', 'the'), 3754.0),
(('.', '``'), 3515.0),
(('to', 'the'), 3426.0),
(("''", '.'), 3332.0),
((';', ';'), 2784.0),
(('.', 'He'), 2660.0)]
unigrams_rawfreq = nltk.FreqDist(brown.words())
w1f = unigrams_rawfreq['of']
w2f = unigrams_rawfreq['the']
w1w2 = [freq for (w1,w2),freq in bigrams_rawfreq if w1=="of" and w2=="the"][0]
corpus_size = np.sum(list(unigrams_rawfreq.values()))
"""
w1 _w1
w2 w1w2 ____ w2f
_w2 ____ ____
w1f corpus_size
"""
print(w1f, w2f, w1w2,corpus_size)
36080 62713 9625.0 1161192
print('Delta P Forward for `of the`:', (w1w2/(w1f))-((w2f-w1w2)/(corpus_size-w1f)))
print('Delta P Backward for `of the`:', (w1w2/(w2f))-((w1f-w1w2)/(corpus_size-w2f)))
Delta P Forward for `of the`: 0.2195836568422283
Delta P Backward for `of the`: 0.12939364991590951
print([dp for (w1, w2),dp in bigrams_dpfwd if w1=="of" and w2=="the"])
print([dp for (w1, w2),dp in bigrams_dpbwd if w1=="of" and w2=="the"])
[0.2195836568422283]
[0.12939364991590951]
Note
How can we implement delta P for trigrams?
# Inherit TrigramAssocMeasures
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

class AugmentedTrigramAssocMeasures(TrigramAssocMeasures):
    """
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

        n_iii counts (w1, w2, w3), i.e. the trigram being scored
        n_ixx counts (w1, *, *)
        n_xxx counts (*, *, *), i.e. any trigram
    """

    @classmethod
    def dp_fwd(cls, *marginals):
        """
        Scores trigrams using delta P forward, i.e. the conditional probability of w3
        given (w1, w2) minus the conditional probability of w3 in the absence of (w1, w2).
        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = cls._contingency(*marginals)
        return (n_iii / (n_iii + n_iio)) - (n_ooi / (n_ooi + n_ooo))

    @classmethod
    def dp_bwd(cls, *marginals):
        """
        Scores trigrams using delta P backward, i.e. the conditional probability of w1
        given (w2, w3) minus the conditional probability of w1 in the absence of (w2, w3).
        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = cls._contingency(*marginals)
        return (n_iii / (n_iii + n_oii)) - (n_ioo / (n_ioo + n_ooo))
trigram_measures = AugmentedTrigramAssocMeasures()
finder3 = TrigramCollocationFinder.from_words(brown.words())
finder3.apply_freq_filter(10)
finder3.nbest(trigram_measures.pmi, 10)
[("Drug's", 'chemical', 'name'),
('Brown', '&', 'Sharpe'),
('B.', '&', 'O.'),
('per', 'capita', 'income'),
('John', 'A.', 'Notte'),
('average', 'per', 'capita'),
('General', 'Motors', 'stock'),
('basic', 'wage', 'rate'),
('World', 'War', '2'),
('New', 'York', 'Times')]
finder3.nbest(trigram_measures.dp_fwd, 10)
[('the', 'Lo', 'Shu'),
('average', 'per', 'capita'),
('of', 'Economic', 'Affairs'),
('the', 'minimal', 'polynomial'),
('B.', '&', 'O.'),
('the', 'Export-Import', 'Bank'),
('Chamber', 'of', 'Commerce'),
('Notte', ',', 'Jr.'),
('.', "Drug's", 'chemical'),
('v.', 'United', 'States')]
finder3.nbest(trigram_measures.dp_bwd,10)
[('Puerto', 'Rico', ','),
('Los', 'Angeles', ','),
('dominant', 'stress', 'will'),
('couple', 'of', 'weeks'),
('A.', 'Notte', ','),
('United', 'States', 'is'),
('Brown', '&', 'Sharpe'),
('Department', 'of', 'Economic'),
('boys', 'and', 'girls'),
('General', 'Motors', 'stock')]
Concordance#
## Simple Concordances
brown_text.concordance('American', width=79, lines = 5)
Displaying 5 of 569 matches:
will deliver tomorrow night to the American people over nationwide television
ocial security taxes on 70 million American workers would be raised to pay the
o retired as vice president of the American Screw Co. in 1955 said , `` Both p
wice elected overwhelmingly by the American people as president of the United
n example : Last month in Ghana an American missionary discovered when he came
#nltk.app.concordance()
## Regular Expression Concordances
import re
sents = [' '.join(s) for s in brown.sents()]
regex_1 = r'(is|was) \w+ing'
targets = [sent for sent in sents[:100] if re.search(regex_1, sent)]
targets[0]
#if targets:
# for match in targets:
# print(match.strip())
"The City Purchasing Department , the jury said , `` is lacking in experienced clerical personnel as a result of city personnel policies '' ."
Frequency List#
## word frequencies
brown_fd_words = nltk.FreqDist(brown.words())
brown_fd_words.most_common(10)
[('the', 62713),
(',', 58334),
('.', 49346),
('of', 36080),
('and', 27915),
('to', 25732),
('a', 21881),
('in', 19536),
('that', 10237),
('is', 10011)]
## nouns freq
brown_fd_nouns = nltk.FreqDist([w.lower() for w,t in brown.tagged_words()
                                if any(noun_tag in t for noun_tag in ['NP','NN'])])
brown_fd_nouns.most_common(10)
brown_fd_nouns_df = pd.DataFrame(brown_fd_nouns.items(), columns=['word','freq'])
Sort the data frame:
brown_fd_nouns_df[brown_fd_nouns_df['freq']>100].sort_values(["freq","word"],ascending=[False,True])
|      | word    | freq |
|------|---------|------|
| 243  | time    | 1597 |
| 174  | man     | 1203 |
| 5114 | af      | 995  |
| 248  | years   | 949  |
| 779  | way     | 899  |
| ...  | ...     | ...  |
| 1030 | events  | 101  |
| 204  | james   | 101  |
| 1723 | officer | 101  |
| 272  | test    | 101  |
| 3644 | trees   | 101  |

433 rows × 2 columns
Note
We can also pass the data frame to R for data exploration.
%load_ext rpy2.ipython
%%R -i brown_fd_nouns_df
library(dplyr)
brown_fd_nouns_df %>%
  filter(freq > 100) %>%
  arrange(desc(freq), word) %>%
  head(50)
R[write to console]:
Attaching package: ‘dplyr’
R[write to console]: The following objects are masked from ‘package:stats’:
filter, lag
R[write to console]: The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
word freq
243 time 1597
174 man 1203
5114 af 995
248 years 949
779 way 899
486 people 845
1011 mr. 844
63 state 787
1099 world 787
1227 men 763
1438 life 715
303 day 687
175 year 656
875 states 586
278 work 583
299 house 582
158 mrs. 534
865 part 496
9 place 496
340 school 489
32 number 470
1801 course 465
1173 war 463
101 fact 447
590 water 444
1343 hand 423
896 government 418
229 system 416
121 night 411
1217 head 407
1869 eyes 401
756 business 393
12 city 393
72 program 388
525 group 386
371 days 384
819 room 383
656 president 382
1001 side 375
39 end 369
1246 point 369
1254 things 368
212 john 362
1061 use 361
701 case 360
354 order 359
459 children 355
356 church 348
1108 power 340
595 development 333
Conditional Frequency List#
## Word by POS Frequency Distribution
brown_news_tagged_words = brown.tagged_words(categories='news', tagset='universal')
brown_news_cfd = nltk.ConditionalFreqDist(brown_news_tagged_words)
brown_news_cfd['yield']
FreqDist({'NOUN': 5, 'VERB': 1})
## POS by Word Frequency Distribution
brown_news_cfd2 = nltk.ConditionalFreqDist([(t, w) for (w, t) in brown_news_tagged_words])
brown_news_cfd2['VERB'].most_common(10)
[('is', 732),
('was', 717),
('be', 526),
('said', 402),
('will', 388),
('are', 328),
('has', 300),
('had', 279),
('have', 265),
('were', 252)]
## Word by Genre Frequency Distribution
brown_genre_cfd = nltk.ConditionalFreqDist(
    (word, genre)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
brown_genre_cfd.conditions()[:50]
brown_genre_cfd['mysterious']
FreqDist({'belles_lettres': 6, 'fiction': 4, 'lore': 3, 'religion': 3, 'romance': 3, 'learned': 2, 'reviews': 2, 'adventure': 1, 'humor': 1, 'science_fiction': 1})
print(sorted(brown_genre_cfd['mysterious'].items(),key=lambda x:x[1],reverse=True)) # with freq
[('belles_lettres', 6), ('fiction', 4), ('lore', 3), ('religion', 3), ('romance', 3), ('learned', 2), ('reviews', 2), ('adventure', 1), ('humor', 1), ('science_fiction', 1)]
## Genre by Word Frequency Distribution
brown_genre_cdf2 = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
top_n_word = [word for (word, freq) in brown_fd_words.most_common(20) if word[0].isalpha()]
brown_genre_cdf2.tabulate(conditions=['adventure','editorial','fiction'],
                          samples=top_n_word[:10])
the of and to a in that is was for
adventure 3370 1322 1622 1309 1354 847 494 98 914 331
editorial 3508 1976 1302 1554 1095 1001 578 744 308 509
fiction 3423 1419 1696 1489 1281 916 530 144 1082 392
top_n_word2 = [word for (word, tag) in brown.tagged_words(tagset='universal')
               if tag.startswith('NOUN')]
top_n_word2_fd = nltk.FreqDist(top_n_word2).most_common(10)
print(top_n_word2_fd)
brown_genre_cdf2.tabulate(conditions=['adventure','editorial','fiction'],
                          samples=[w for (w, f) in top_n_word2_fd])
[('time', 1555), ('man', 1148), ('Af', 994), ('years', 942), ('way', 883), ('Mr.', 844), ('people', 809), ('men', 736), ('world', 684), ('life', 676)]
time man Af years way Mr. people men world life
adventure 127 165 0 32 65 22 24 81 15 29
editorial 72 56 0 63 43 110 75 38 66 35
fiction 99 111 0 44 62 39 39 72 24 44