# Text Normalization (English)
import spacy
import unicodedata
#from contractions import CONTRACTION_MAP
import re
from nltk.corpus import wordnet
import collections
#from textblob import Word
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import spacy
# Load the spaCy English pipeline once at module level; lemmatize_text() calls it.
# Use spacy.load('en') if you downloaded the 'en' language model directly after installing spaCy.
# NOTE(review): the 'en' shortcut name and the parse/tag/entity keyword overrides look like
# older spaCy conventions — confirm they are accepted by the installed spaCy version.
nlp = spacy.load('en', parse=False, tag=True, entity=False)
# Sample sentence; it is not referenced again in the visible code.
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma.

    The string is processed with the module-level spaCy pipeline ``nlp``.
    Tokens whose lemma is the literal ``'-PRON-'`` keep their original
    surface text (presumably spaCy's pronoun placeholder in older
    releases — TODO confirm for the installed version).

    Args:
        text: Raw input string.

    Returns:
        The per-token lemmas (or surface forms) joined with single spaces.
        Because every token is joined with a space, punctuation tokens end
        up space-separated in the result.
    """
    # Keep the parsed document under its own name instead of rebinding the
    # ``text`` parameter to a different type.
    doc = nlp(text)
    return ' '.join(
        token.text if token.lemma_ == '-PRON-' else token.lemma_
        for token in doc
    )
# Example call of lemmatize_text() on a sample sentence.
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
# Presumably the output recorded for the call above; as a bare string literal
# this line has no effect when the file is executed.
'My system keep crash ! his crash yesterday , ours crash daily'
from contractions import CONTRACTION_MAP
import re
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` using ``contraction_mapping``.

    Matching is case-insensitive. The first character of each matched
    contraction is kept as written, so the capitalisation of the original
    text survives the expansion. Any apostrophes still present after the
    expansion are removed.

    Args:
        text: Input string.
        contraction_mapping: Mapping from contraction to its expanded form.

    Returns:
        ``text`` with known contractions expanded and remaining apostrophes
        stripped.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string at every
        # position; there is nothing to expand, so only strip apostrophes.
        return text.replace("'", "")

    # Longest keys first, so a contraction such as "he'd've" is not shadowed
    # by its shorter prefix "he'd" in the alternation. Keys are escaped
    # because they are literal text, not regex fragments.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL,
    )
    # Case-insensitive fallback for matches whose exact spelling is not a key.
    lowercase_mapping = {
        key.lower(): value for key, value in contraction_mapping.items()
    }

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (
            contraction_mapping.get(match)
            or lowercase_mapping.get(match.lower())
        )
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched rather
            # than failing on a missing value.
            return match
        # Preserve the case of the first character as written in the text.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text.replace("'", "")
# Example calls of expand_contractions() on sample sentences containing contractions.
expand_contractions("Y'all can't expand contractions I'd think")
expand_contractions("I'm very glad he's here!")
# Presumably the output recorded for the last call above; as a bare string
# literal this line has no effect when the file is executed.
'I am very glad he is here!'
# Accented Characters (Non-ASCII)
import unicodedata
def remove_accented_chars(text):
    """Return ``text`` with accents stripped and non-ASCII characters dropped.

    NFKD normalization applies the compatibility decomposition, i.e. it
    replaces all compatibility characters with their equivalents, so an
    accented letter becomes its base letter followed by combining marks.
    Encoding to ASCII with ``'ignore'`` then discards the combining marks
    and any other character that has no ASCII form.

    Args:
        text: Input string.

    Returns:
        The ASCII-only version of ``text``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# Example call of remove_accented_chars() on a string with accented letters.
remove_accented_chars('Sómě Áccěntěd těxt')
# The commented-out prints below apply the same conversion one step at a time:
# normalize only, then normalize + encode, then normalize + encode + decode.
# print(unicodedata.normalize('NFKD', 'Sómě Áccěntěd těxt'))
# print(unicodedata.normalize('NFKD', 'Sómě Áccěntěd těxt').encode('ascii','ignore'))
# print(unicodedata.normalize('NFKD', 'Sómě Áccěntěd těxt').encode('ascii','ignore').decode('utf-8', 'ignore'))
# Presumably the output recorded for the call above; as a bare string literal
# this line has no effect when the file is executed.
'Some Accented text'
# Special Characters
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
# Shared tokenizer instance used by remove_stopwords().
tokenizer = ToktokTokenizer() # assumes each input line holds one sentence
# With a different tokenizer, sentence tokenization may need to run first.
# English stopword list from NLTK; it is the default `stopwords` argument of remove_stopwords().
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    """Remove stopwords from ``text`` and return the remaining tokens.

    Args:
        text: Input string; it is split with the module-level ``tokenizer``.
        is_lower_case: Pass True when ``text`` is already lower-cased, so
            tokens are compared against ``stopwords`` as they are. Otherwise
            each token is lower-cased for the comparison only, and the
            tokens that are kept retain their original casing.
        stopwords: Collection of stopwords to drop.

    Returns:
        The surviving tokens joined with single spaces.
    """
    # Build the lookup set once so each membership test is O(1).
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [
            token for token in tokens if token not in stopword_set
        ]
    else:
        # Without this branch the case-sensitive result above would always
        # be overwritten by the lower-casing comparison.
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopword_set
        ]
    return ' '.join(filtered_tokens)
# Example call of remove_stopwords() with the default arguments.
remove_stopwords("The, and, if are stopwords, computer is not")
# Presumably the output recorded for the call above; as a bare string literal
# this line has no effect when the file is executed.
', , stopwords , computer'