{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text Normalization (English)\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "import unicodedata\n", "#from contractions import CONTRACTION_MAP\n", "import re\n", "from nltk.corpus import wordnet\n", "import collections\n", "#from textblob import Word\n", "from nltk.tokenize.toktok import ToktokTokenizer\n", "from bs4 import BeautifulSoup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## HTML Tags" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'\\n\\n\\n\\nPython (programming language) - Wikipedia\\n