Corpus Linguistics Methods#

  • With nltk, we can easily implement quite a few corpus-linguistic methods

    • Concordance Analysis (Simple Word Search)

    • Frequency Lists

    • Collocations

    • Data Analysis with R

    • Concordance Analysis (Patterns, Constructions?)

      • Patterns on sentence strings

      • Patterns on sentence word-tag strings

Preparing Corpus Data#

import nltk
from nltk.corpus import brown
from nltk.text import Text
import pandas as pd
import numpy as np

brown_text = Text(brown.words())

Collocations#

  • Documentation nltk.collocations

  • nltk.collocations: Get the BigramCollocationFinder which we can use to find n-grams

  • nltk.metrics: Get the BigramAssocMeasures to define collocations (It’s also available in nltk.collocations)

  • Use finder.nbest() methods to select/filter collocations

## Collocations based on Text
brown_text.collocation_list()[:10]
#brown_text.collocations()

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = nltk.collocations.BigramAssocMeasures() # measures
finder = BigramCollocationFinder.from_words(brown.words()) # finders
## bigram collocations based on different association measures
finder.nbest(bigram_measures.likelihood_ratio,10)
finder.nbest(bigram_measures.pmi, 10)
[('$10,000-per-year', 'French-born'),
 ('$79.89', 'nothing-down'),
 ('$8.50', 'tab'),
 ("'low", 'nigras'),
 ('0.5-mv./m.', '50-percent'),
 ('0.78', 'mEq'),
 ('1,100', 'circumscriptions'),
 ('1,257,700', 'non-farm'),
 ('11-inch', 'headroom'),
 ('11-shot', 'hammerless')]

Apply Filters#

We can create an anonymous function as a helper to remove irrelevant word tokens before collocation computation.

For example, we remove:

  • word tokens whose char length < 3

  • word tokens that belong to the stopwords

  • word tokens that include at least one non-alphabetic char

## Apply freq-based filters for bigram collocations
finder.apply_freq_filter(10)

## Apply word filter function
from nltk.corpus import stopwords
stop_words_en = stopwords.words('english')


filter_stops = lambda w: len(w)<3 or w in stop_words_en or not w.isalpha()


finder.apply_word_filter(filter_stops) # filter on word tokens
finder.apply_freq_filter(10) # filter on bigram min frequencies 
finder.nbest(bigram_measures.likelihood_ratio, 10)
finder.nbest(bigram_measures.pmi, 10)
[('Hong', 'Kong'),
 ('Viet', 'Nam'),
 ('Pathet', 'Lao'),
 ('Simms', 'Purdew'),
 ('Internal', 'Revenue'),
 ('Puerto', 'Rico'),
 ('Saxon', 'Shore'),
 ('carbon', 'tetrachloride'),
 ('unwed', 'mothers'),
 ('Armed', 'Forces')]

POS Collocations#

## Create collocations based on tags only
finder = BigramCollocationFinder.from_words(
    t for w, t in brown.tagged_words(tagset='universal') if t != 'X')
finder.nbest(bigram_measures.likelihood_ratio, 10)
[('ADP', 'DET'),
 ('DET', 'NOUN'),
 ('PRON', 'VERB'),
 ('ADJ', 'NOUN'),
 ('NOUN', '.'),
 ('NOUN', 'DET'),
 ('DET', 'ADJ'),
 ('NOUN', 'ADP'),
 ('PRT', 'VERB'),
 ('ADP', '.')]

Collocations based on Skipped Bigrams#

## Create collocations with intervening words (gapped n-grams);
## note that window_size=2 still yields contiguous bigrams (see the sketch after the output below)
finder = BigramCollocationFinder.from_words(brown.words(), window_size=2)
finder.apply_word_filter(filter_stops)
finder.apply_freq_filter(10)
finder.nbest(bigram_measures.likelihood_ratio, 10)
[('United', 'States'),
 ('New', 'York'),
 ('per', 'cent'),
 ('Rhode', 'Island'),
 ('years', 'ago'),
 ('Los', 'Angeles'),
 ('White', 'House'),
 ('Peace', 'Corps'),
 ('World', 'War'),
 ('San', 'Francisco')]
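The pairs above are still adjacent because window_size=2 only considers neighbouring tokens. A minimal sketch of a truly gapped search, assuming window_size=3 so that one intervening word is allowed (the name finder_gapped is ours):

## Gapped bigrams: window_size=3 pairs each token with the next two tokens,
## so one intervening word may occur between the two collocates
finder_gapped = BigramCollocationFinder.from_words(brown.words(), window_size=3)
finder_gapped.apply_word_filter(filter_stops)
finder_gapped.apply_freq_filter(10)
finder_gapped.nbest(bigram_measures.likelihood_ratio, 10)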

Scoring Ngrams#

## Score ngrams with the finder
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]
[(('United', 'States'), 0.0003375841376792124),
 (('New', 'York'), 0.00025491047130879306),
 (('per', 'cent'), 0.00012573286760501277),
 (('years', 'ago'), 0.0001171210273580941),
 (('The', 'first'), 8.267366637041936e-05),
 (('Rhode', 'Island'), 7.750656222226816e-05),
 (('could', 'see'), 7.492301014819255e-05),
 (('last', 'year'), 5.856051367904705e-05),
 (('first', 'time'), 5.769932965435518e-05),
 (('White', 'House'), 5.5976961604971446e-05)]
scored = finder.above_score(bigram_measures.pmi, min_score = 15)
for s in scored:
    print(s)
('Hong', 'Kong')
('Viet', 'Nam')
('Pathet', 'Lao')
('Simms', 'Purdew')
('Internal', 'Revenue')
('Puerto', 'Rico')
('Saxon', 'Shore')
('carbon', 'tetrachloride')
('unwed', 'mothers')

Retrieve n-grams#

Note

The * unpacks the nested list (here, a generator of slices) and passes each element as a separate argument to the function call. So we can use * to "unlist" a nested list, similar to unlist() in R (see the short expansion after the output below).

def compute_ngrams(sequence, n):
    return list(zip(*(sequence[index:] for index in range(n))))

print(compute_ngrams([1,2,3,4], 2))
print(compute_ngrams([1,2,3,4,5],3))
[(1, 2), (2, 3), (3, 4)]
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
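To make the role of * concrete: for n=2, the generator produces the two slices sequence[0:] and sequence[1:], and * passes them to zip() as separate arguments. A tiny equivalent expansion (the name seq is ours):

## Equivalent to compute_ngrams([1, 2, 3, 4], 2): the two slices are
## unpacked into separate arguments of zip()
seq = [1, 2, 3, 4]
print(list(zip(seq[0:], seq[1:])))  # [(1, 2), (2, 3), (3, 4)]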

Dispersion#

  • The dispersion of a linguistic unit across the corpus is also important.

  • We need a metric that indicates how evenly the unit is distributed across the corpus files.

Note

How can we get the document frequency of the bigrams?

unigram_freq = nltk.FreqDist(brown.words())
bigram_freq = nltk.FreqDist('_'.join(x) for x in nltk.bigrams(brown.words()))
# ngram freq list of each file in the corpus
unigram_freq_per_file = [nltk.FreqDist(words) 
                         for words in [brown.words(fileids=f) for f in brown.fileids()]]
bigram_freq_per_file = [nltk.FreqDist('_'.join(x) for x in nltk.bigrams(words))
                         for words in [brown.words(fileids=f) for f in brown.fileids()]]
## Function to get unigram dispersion (number of corpus files each unigram occurs in)
def createDispersionDist(uni_freq, uni_freq_per_file):
    # uni_freq is kept for the original call signature but not used here
    unigram_dispersion = {}

    # count, for each word, the number of files it appears in
    for fid in uni_freq_per_file:
        for w, f in fid.items():
            if w in unigram_dispersion:
                unigram_dispersion[w] += 1
            else:
                unigram_dispersion[w] = 1
    return unigram_dispersion
unigram_dispersion = createDispersionDist(unigram_freq, unigram_freq_per_file)
# Dictionary cannot be sliced/subset
# Get the items() and convert to list for subsetting
list(unigram_dispersion.items())[:20]
[('The', 500),
 ('Fulton', 3),
 ('County', 45),
 ('Grand', 17),
 ('Jury', 4),
 ('said', 314),
 ('Friday', 34),
 ('an', 498),
 ('investigation', 34),
 ('of', 500),
 ("Atlanta's", 2),
 ('recent', 114),
 ('primary', 59),
 ('election', 28),
 ('produced', 66),
 ('``', 462),
 ('no', 455),
 ('evidence', 119),
 ("''", 463),
 ('that', 500)]
#dict(sorted(bigram_freq.items()[:3]))
list(bigram_freq.items())[:20]
[('The_Fulton', 1),
 ('Fulton_County', 6),
 ('County_Grand', 1),
 ('Grand_Jury', 2),
 ('Jury_said', 1),
 ('said_Friday', 4),
 ('Friday_an', 1),
 ('an_investigation', 7),
 ('investigation_of', 15),
 ("of_Atlanta's", 1),
 ("Atlanta's_recent", 1),
 ('recent_primary', 1),
 ('primary_election', 2),
 ('election_produced', 1),
 ('produced_``', 1),
 ('``_no', 6),
 ('no_evidence', 14),
 ("evidence_''", 3),
 ("''_that", 16),
 ('that_any', 31)]
bigram_dispersion = createDispersionDist(bigram_freq, bigram_freq_per_file)
list(bigram_dispersion.items())[:20]
[('The_Fulton', 1),
 ('Fulton_County', 1),
 ('County_Grand', 1),
 ('Grand_Jury', 2),
 ('Jury_said', 1),
 ('said_Friday', 3),
 ('Friday_an', 1),
 ('an_investigation', 7),
 ('investigation_of', 14),
 ("of_Atlanta's", 1),
 ("Atlanta's_recent", 1),
 ('recent_primary', 1),
 ('primary_election', 2),
 ('election_produced', 1),
 ('produced_``', 1),
 ('``_no', 6),
 ('no_evidence', 12),
 ("evidence_''", 3),
 ("''_that", 16),
 ('that_any', 30)]
type(unigram_freq)
type(unigram_dispersion)
dict

Note

We can implement the DP ("deviation of proportions") dispersion metric proposed by Gries (2008); a minimal sketch follows below.
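Gries's (2008) DP compares, for each corpus part, the part's share of the whole corpus with its share of the unit's occurrences. A minimal sketch built on the per-file frequency lists computed above (the names dp_dispersion, file_sizes, and total_corpus_size are ours):

## DP (deviation of proportions) = 0.5 * sum over files of
## |expected proportion (file size / corpus size) - observed proportion (freq in file / total freq)|
## 0 = perfectly even dispersion; values near 1 = concentrated in very few files
file_sizes = [len(brown.words(fileids=f)) for f in brown.fileids()]
total_corpus_size = sum(file_sizes)

def dp_dispersion(unit, freq_per_file):
    total_freq = sum(fd[unit] for fd in freq_per_file)
    if total_freq == 0:
        return None
    return 0.5 * sum(abs(size/total_corpus_size - fd[unit]/total_freq)
                     for size, fd in zip(file_sizes, freq_per_file))

print(dp_dispersion('the', unigram_freq_per_file))      # evenly dispersed
print(dp_dispersion('Fulton', unigram_freq_per_file))   # clumped in a few files
print(dp_dispersion('of_the', bigram_freq_per_file))    # also works for the joined bigrams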

Delta P#

  • This is a directional association metric.
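In terms of the contingency counts used in the class below, where n_ii is the frequency of (w1, w2), n_io of (w1, ~w2), n_oi of (~w1, w2), and n_oo of (~w1, ~w2), the two directional measures are:

$$\Delta P_{\text{fwd}} = P(w_2 \mid w_1) - P(w_2 \mid \neg w_1) = \frac{n_{ii}}{n_{ii}+n_{io}} - \frac{n_{oi}}{n_{oi}+n_{oo}}$$

$$\Delta P_{\text{bwd}} = P(w_1 \mid w_2) - P(w_1 \mid \neg w_2) = \frac{n_{ii}}{n_{ii}+n_{oi}} - \frac{n_{io}}{n_{io}+n_{oo}}$$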

## Inherit BigramAssocMeasures
class AugmentedBigramAssocMeasures(BigramAssocMeasures):
    @classmethod
    def raw_freq2(cls,*marginals):          
        """Scores ngrams by their frequency"""
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)
        return n_ii
    
    @classmethod
    def dp_fwd(cls, *marginals):
        """Scores bigrams using DP forward
        This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
        """
        
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)

        return (n_ii/(n_ii+n_io)) - (n_oi/(n_oi+n_oo))

    @classmethod
    def dp_bwd(cls, *marginals):
        """Scores bigrams using DP backward
        This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
        """
        
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)

        return (n_ii/(n_ii+n_oi)) - (n_io/(n_io+n_oo))
bigram_measures = AugmentedBigramAssocMeasures()
finder = BigramCollocationFinder.from_words(brown.words())
#finder.apply_freq_filter(10)
bigrams_dpfwd = finder.score_ngrams(bigram_measures.dp_fwd)
bigrams_dpfwd[:10]
[(('$10,000-per-year', 'French-born'), 1.0),
 (('$79.89', 'nothing-down'), 1.0),
 (('$8.50', 'tab'), 1.0),
 (("'low", 'nigras'), 1.0),
 (('0.5-mv./m.', '50-percent'), 1.0),
 (('0.78', 'mEq'), 1.0),
 (('1,100', 'circumscriptions'), 1.0),
 (('1,257,700', 'non-farm'), 1.0),
 (('11-inch', 'headroom'), 1.0),
 (('11-shot', 'hammerless'), 1.0)]
bigrams_dpbwd = finder.score_ngrams(bigram_measures.dp_bwd)
bigrams_dpbwd[:10]
[(('$10,000-per-year', 'French-born'), 1.0),
 (('$79.89', 'nothing-down'), 1.0),
 (('$8.50', 'tab'), 1.0),
 (("'low", 'nigras'), 1.0),
 (('0.5-mv./m.', '50-percent'), 1.0),
 (('0.78', 'mEq'), 1.0),
 (('1,100', 'circumscriptions'), 1.0),
 (('1,257,700', 'non-farm'), 1.0),
 (('11-inch', 'headroom'), 1.0),
 (('11-shot', 'hammerless'), 1.0)]

Checking Computation Accuracy#

  • Check whether delta P is computed correctly.

bigrams_rawfreq = finder.score_ngrams(bigram_measures.raw_freq2)
bigrams_rawfreq[:10]
[(('of', 'the'), 9625.0),
 ((',', 'and'), 6288.0),
 (('.', 'The'), 6081.0),
 (('in', 'the'), 5546.0),
 ((',', 'the'), 3754.0),
 (('.', '``'), 3515.0),
 (('to', 'the'), 3426.0),
 (("''", '.'), 3332.0),
 ((';', ';'), 2784.0),
 (('.', 'He'), 2660.0)]
unigrams_rawfreq = nltk.FreqDist(brown.words())
w1f = unigrams_rawfreq['of']
w2f = unigrams_rawfreq['the']
w1w2 = [freq for (w1,w2),freq in bigrams_rawfreq if w1=="of" and w2=="the"][0]
corpus_size = np.sum(list(unigrams_rawfreq.values()))

"""
        w1     _w1
w2      w1w2   ____    w2f
_w2     ____   ____
        w1f            corpus_size
"""

print(w1f, w2f, w1w2,corpus_size)
36080 62713 9625.0 1161192
print('Delta P Forward for `of the`:', (w1w2/(w1f))-((w2f-w1w2)/(corpus_size-w1f)))
print('Delta P Backward for `of the`:', (w1w2/(w2f))-((w1f-w1w2)/(corpus_size-w2f)))
Delta P Forward for `of the`: 0.2195836568422283
Delta P Backward for `of the`: 0.12939364991590951
print([dp for (w1, w2),dp in bigrams_dpfwd if w1=="of" and w2=="the"])
print([dp for (w1, w2),dp in bigrams_dpbwd if w1=="of" and w2=="the"])
[0.2195836568422283]
[0.12939364991590951]

Note

How can we implement delta P for trigrams?

# Inherit TrigramAssocMeasures
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder
class AugmentedTrigramAssocMeasures(TrigramAssocMeasures):
    """
    A collection of trigram association measures. Each association measure
        is provided as a function with four arguments::

            trigram_score_fn(n_iii,
                             (n_iix, n_ixi, n_xii),
                             (n_ixx, n_xix, n_xxi),
                             n_xxx)

        The arguments constitute the marginals of a contingency table, counting
        the occurrences of particular events in a corpus. The letter i in the
        suffix refers to the appearance of the word in question, while x indicates
        the appearance of any word. Thus, for example:
        n_iii counts (w1, w2, w3), i.e. the trigram being scored
        n_ixx counts (w1, *, *)
        n_xxx counts (*, *, *), i.e. any trigram
    """
    
    @classmethod
    def dp_fwd(cls, *marginals):
        """
        Scores trigrams using delta P forward, i.e. conditional prob of w3 given w1,w2
        minus conditional prob of w3, in the absence of w1,w2
        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = cls._contingency(*marginals)

        return ((n_iii)/(n_iii+n_iio)) - ((n_ooi)/(n_ooi+n_ooo))
    @classmethod
    def dp_bwd(cls, *marginals):
        """
        Scores trigrams using delta P backward, i.e. conditional prob of w1 given w2,w3
        minus conditional prob of w1, in the absence of w2,w3
        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = cls._contingency(*marginals)

        return ((n_iii)/(n_iii+n_oii)) - ((n_ioo)/(n_ioo+n_ooo))
trigram_measures = AugmentedTrigramAssocMeasures()
finder3 = TrigramCollocationFinder.from_words(brown.words())
finder3.apply_freq_filter(10)
finder3.nbest(trigram_measures.pmi, 10)
[("Drug's", 'chemical', 'name'),
 ('Brown', '&', 'Sharpe'),
 ('B.', '&', 'O.'),
 ('per', 'capita', 'income'),
 ('John', 'A.', 'Notte'),
 ('average', 'per', 'capita'),
 ('General', 'Motors', 'stock'),
 ('basic', 'wage', 'rate'),
 ('World', 'War', '2'),
 ('New', 'York', 'Times')]
finder3.nbest(trigram_measures.dp_fwd, 10)
[('the', 'Lo', 'Shu'),
 ('average', 'per', 'capita'),
 ('of', 'Economic', 'Affairs'),
 ('the', 'minimal', 'polynomial'),
 ('B.', '&', 'O.'),
 ('the', 'Export-Import', 'Bank'),
 ('Chamber', 'of', 'Commerce'),
 ('Notte', ',', 'Jr.'),
 ('.', "Drug's", 'chemical'),
 ('v.', 'United', 'States')]
finder3.nbest(trigram_measures.dp_bwd,10)
[('Puerto', 'Rico', ','),
 ('Los', 'Angeles', ','),
 ('dominant', 'stress', 'will'),
 ('couple', 'of', 'weeks'),
 ('A.', 'Notte', ','),
 ('United', 'States', 'is'),
 ('Brown', '&', 'Sharpe'),
 ('Department', 'of', 'Economic'),
 ('boys', 'and', 'girls'),
 ('General', 'Motors', 'stock')]

Concordance#

## Simple Concordances
brown_text.concordance('American', width=79, lines = 5)
Displaying 5 of 569 matches:
will deliver tomorrow night to the American people over nationwide television 
ocial security taxes on 70 million American workers would be raised to pay the
o retired as vice president of the American Screw Co. in 1955 said , `` Both p
wice elected overwhelmingly by the American people as president of the United 
n example : Last month in Ghana an American missionary discovered when he came
#nltk.app.concordance()
## Regular Expression Concordances
import re
sents = [' '.join(s) for s in brown.sents()]
regex_1 = r'(is|was) \w+ing'
targets = [sent for sent in sents[:100] if re.search(regex_1, sent)]
targets[0]
#if targets:
#    for match in targets:
#        print(match.strip())
"The City Purchasing Department , the jury said , `` is lacking in experienced clerical personnel as a result of city personnel policies '' ."

Frequency List#

## word frequencies
brown_fd_words = nltk.FreqDist(brown.words())
brown_fd_words.most_common(10)
[('the', 62713),
 (',', 58334),
 ('.', 49346),
 ('of', 36080),
 ('and', 27915),
 ('to', 25732),
 ('a', 21881),
 ('in', 19536),
 ('that', 10237),
 ('is', 10011)]
## nouns freq
brown_fd_nouns = nltk.FreqDist([w.lower() for w,t in brown.tagged_words() 
                                 if any (noun_tag in t for noun_tag in ['NP','NN'])])
brown_fd_nouns.most_common(10)

brown_fd_nouns_df = pd.DataFrame(brown_fd_nouns.items(), columns=['word','freq'])

Sort the data frame:

brown_fd_nouns_df[brown_fd_nouns_df['freq']>100].sort_values(["freq","word"],ascending=[False,True])
word freq
243 time 1597
174 man 1203
5114 af 995
248 years 949
779 way 899
... ... ...
1030 events 101
204 james 101
1723 officer 101
272 test 101
3644 trees 101

433 rows × 2 columns

Note

We can also pass the data frame to R for data exploration.

%load_ext rpy2.ipython
%%R -i brown_fd_nouns_df

library(dplyr)
brown_fd_nouns_df %>%
filter(freq > 100) %>%
arrange(desc(freq), word) %>% 
head(50)
R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
            word freq
243         time 1597
174          man 1203
5114          af  995
248        years  949
779          way  899
486       people  845
1011         mr.  844
63         state  787
1099       world  787
1227         men  763
1438        life  715
303          day  687
175         year  656
875       states  586
278         work  583
299        house  582
158         mrs.  534
865         part  496
9          place  496
340       school  489
32        number  470
1801      course  465
1173         war  463
101         fact  447
590        water  444
1343        hand  423
896   government  418
229       system  416
121        night  411
1217        head  407
1869        eyes  401
756     business  393
12          city  393
72       program  388
525        group  386
371         days  384
819         room  383
656    president  382
1001        side  375
39           end  369
1246       point  369
1254      things  368
212         john  362
1061         use  361
701         case  360
354        order  359
459     children  355
356       church  348
1108       power  340
595  development  333

Conditional Frequency List#

## Word by POS Frequency Distribution

brown_news_tagged_words = brown.tagged_words(categories='news', tagset='universal')
brown_news_cfd = nltk.ConditionalFreqDist(brown_news_tagged_words)
brown_news_cfd['yield']
FreqDist({'NOUN': 5, 'VERB': 1})
## POS by Word Frequency Distribution
brown_news_cfd2 = nltk.ConditionalFreqDist([(t, w) for (w, t) in brown_news_tagged_words])
brown_news_cfd2['VERB'].most_common(10)
[('is', 732),
 ('was', 717),
 ('be', 526),
 ('said', 402),
 ('will', 388),
 ('are', 328),
 ('has', 300),
 ('had', 279),
 ('have', 265),
 ('were', 252)]
## Word by Genre Frequency Distribution
brown_genre_cfd = nltk.ConditionalFreqDist(
    (word, genre)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
brown_genre_cfd.conditions()[:50]
brown_genre_cfd['mysterious']
FreqDist({'belles_lettres': 6, 'fiction': 4, 'lore': 3, 'religion': 3, 'romance': 3, 'learned': 2, 'reviews': 2, 'adventure': 1, 'humor': 1, 'science_fiction': 1})
print(sorted(brown_genre_cfd['mysterious'].items(),key=lambda x:x[1],reverse=True)) # with freq
[('belles_lettres', 6), ('fiction', 4), ('lore', 3), ('religion', 3), ('romance', 3), ('learned', 2), ('reviews', 2), ('adventure', 1), ('humor', 1), ('science_fiction', 1)]
## Genre by Word Frequency Distribution
brown_genre_cdf2 = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
top_n_word = [word for (word, freq) in brown_fd_words.most_common(20) if word[0].isalpha()]

brown_genre_cdf2.tabulate(conditions=['adventure','editorial','fiction'],
                         samples=top_n_word[:10])
           the   of  and   to    a   in that   is  was  for 
adventure 3370 1322 1622 1309 1354  847  494   98  914  331 
editorial 3508 1976 1302 1554 1095 1001  578  744  308  509 
  fiction 3423 1419 1696 1489 1281  916  530  144 1082  392 
top_n_word2 = [word for (word, tag) in brown.tagged_words(tagset='universal') 
               if tag.startswith('NOUN')]
top_n_word2_fd = nltk.FreqDist(top_n_word2).most_common(10)
print(top_n_word2_fd)
brown_genre_cdf2.tabulate(conditions=['adventure','editorial','fiction'],
                         samples=[w for (w, f) in top_n_word2_fd])
[('time', 1555), ('man', 1148), ('Af', 994), ('years', 942), ('way', 883), ('Mr.', 844), ('people', 809), ('men', 736), ('world', 684), ('life', 676)]
            time    man     Af  years    way    Mr. people    men  world   life 
adventure    127    165      0     32     65     22     24     81     15     29 
editorial     72     56      0     63     43    110     75     38     66     35 
  fiction     99    111      0     44     62     39     39     72     24     44