Text Normalization (English)#

import spacy
import unicodedata
#from contractions import CONTRACTION_MAP
import re
from nltk.corpus import wordnet
import collections
#from textblob import Word
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup

HTML Tags#

import requests
from bs4 import BeautifulSoup

data = requests.get('https://en.wikipedia.org/wiki/Python_(programming_language)')
content = data.content
print(content[:500])
b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Python (programming language) - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"3845bbc1-2be3-40cd-8882-f7'
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    # Can include more HTML preprocessing here...
    stripped_html_elements = soup.find_all(name='div', attrs={'id': 'mw-content-text'})
    stripped_text = ' '.join([h.get_text() for h in stripped_html_elements])
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)  # collapse runs of newlines
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[:500])
General-purpose, high-level programming language
PythonParadigmMulti-paradigm: functional, imperative, object-oriented, structured, reflectiveDesigned byGuido van RossumDeveloperPython Software FoundationFirst appeared1990; 30 years ago (1990)[1]Stable release3.8.5
   / 20 July 2020; 2 months ago (2020-07-20)[2]Preview release3.9.0rc1
   / 11 August 2020; 41 days ago (2020-08-11)[3]
Typing disciplineDuck, dynamic, gradual (since 3.5)[4]OSLinux, macOS, Windows Vista (and newer) and moreLicensePyt

Stemming#
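
Stemming strips suffixes by rule rather than by dictionary lookup, so stems are not always valid words. The cell for this section is missing; as a minimal sketch, NLTK's PorterStemmer (NLTK also ships SnowballStemmer and LancasterStemmer, which apply different rule sets):

from nltk.stem import PorterStemmer

ps = PorterStemmer()
# rule-based suffix stripping; 'daili' shows that stems need not be dictionary words
[ps.stem(word) for word in ['crashes', 'crashing', 'crashed', 'daily']]
# -> ['crash', 'crash', 'crash', 'daili']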

Lemmatization#

import spacy
# requires the small English model: python -m spacy download en_core_web_sm
# (the 'en' shortcut and the parse/tag/entity keywords are legacy spaCy 1.x usage)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'

def lemmatize_text(text):
    text = nlp(text)
    # spaCy 2.x lemmatizes every pronoun to the placeholder '-PRON-';
    # keep the original token in that case (spaCy 3+ dropped '-PRON-' entirely)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
'My system keep crash ! his crash yesterday , ours crash daily'

Contractions#

from contractions import CONTRACTION_MAP
import re

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        # look the match up as-is first, then lower-cased
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        # preserve the casing of the first character
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)  # drop any apostrophes left unmatched
    return expanded_text

expand_contractions("Y'all can't expand contractions I'd think")
'You all cannot expand contractions I would think'

expand_contractions("I'm very glad he's here!")
'I am very glad he is here!'

Accented Characters (Non-ASCII)#

import unicodedata

def remove_accented_chars(text):
    """NFKD applies the compatibility decomposition, i.e. replaces all
    compatibility characters with their equivalents; encoding to ASCII
    then drops the separated accent marks."""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


remove_accented_chars('Sómě Áccěntěd těxt')
'Some Accented text'

# step-by-step breakdown of the pipeline above:
# print(unicodedata.normalize('NFKD', 'Sómě Áccěntěd těxt'))
# print(unicodedata.normalize('NFKD', 'Sómě Áccěntěd těxt').encode('ascii','ignore'))
# print(unicodedata.normalize('NFKD', 'Sómě Áccěntěd těxt').encode('ascii','ignore').decode('utf-8', 'ignore'))

Special Characters#
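
The cell for this section is missing; a minimal sketch that keeps only letters, whitespace, and (optionally) digits, where remove_special_characters and its remove_digits flag are hypothetical names:

import re

def remove_special_characters(text, remove_digits=False):
    # strip everything that is not a letter or whitespace (or a digit, if kept)
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)
# -> 'Well this was fun What do you think '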

Stopwords#

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
# ToktokTokenizer assumes one sentence per line; with other tokenizers,
# sentence tokenization (e.g. nltk.sent_tokenize) may need to run first
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')  # requires nltk.download('stopwords')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

remove_stopwords("The, and, if are stopwords, computer is not")
', , stopwords , computer'

Redundant Whitespaces#
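
The cell here is missing as well; a minimal sketch that collapses runs of whitespace with a single regex (remove_extra_whitespace is a hypothetical helper name):

import re

def remove_extra_whitespace(text):
    # collapse any run of spaces, tabs, or newlines to one space, then trim the ends
    return re.sub(r'\s+', ' ', text).strip()

remove_extra_whitespace('  too   much\t\twhitespace\n here ')
# -> 'too much whitespace here'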

Spelling Checks#
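
The final cell is also missing; since textblob already appears in the commented imports at the top, a minimal sketch with TextBlob's Word.spellcheck() (candidate corrections with confidence scores) and Word.correct() (the top-ranked candidate):

from textblob import Word

w = Word('falibility')
w.spellcheck()   # -> [('fallibility', 1.0)]
w.correct()      # -> 'fallibility'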