Chinese Natural Language Processing (spaCy)#

Installation#

# Install package
## In terminal:
!pip install spacy

## Download language model for Chinese and English
!spacy download en
!python -m spacy download zh_core_web_lg
import spacy
from spacy import displacy
# Load the Chinese language model.
# NOTE(review): the install cell downloads zh_core_web_lg, but the *small*
# model (zh_core_web_sm) is loaded here — confirm which model is intended.
nlp_zh = spacy.load('zh_core_web_sm')## components can be skipped, e.g. disable=["parser"]
# Parse text: running the pipeline tokenizes and annotates the sentence into a Doc.
doc = nlp_zh('這是一個中文的句子')
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/n7/ltpzwx813c599nfxfb94s_640000gn/T/jieba.cache
Loading model cost 0.742 seconds.
Prefix dict has been built successfully.

Linguistic Features#

  • After we parse and tag a given text, we can extract token-level information:

    • Text: the original word text

    • Lemma: the base form of the word

    • POS: the simple universal POS tag

    • Tag: the detailed POS tag

    • Dep: Syntactic dependency

    • Shape: Word shape (capitalization, punctuation, digits)

    • Is alpha: whether the token consists of alphabetic characters

    • Is stop: whether the token is part of a stop-word list

# Parts-of-speech tagging: print one tuple of token-level attributes per token.
for tok in doc:
    attrs = (
        tok.text,      # original word text
        tok.lemma_,    # base form of the word
        tok.pos_,      # simple universal POS tag
        tok.tag_,      # detailed POS tag
        tok.dep_,      # syntactic dependency relation
        tok.shape_,    # word shape
        tok.is_alpha,  # consists of alphabetic characters?
        tok.is_stop,   # stop word?
    )
    print(attrs)
('這', '這', 'ADV', 'AD', 'advmod', 'x', True, False)
('是', '是', 'VERB', 'VC', 'cop', 'x', True, True)
('一', '一', 'NUM', 'CD', 'nummod', 'x', True, True)
('個', '個', 'NUM', 'M', 'mark:clf', 'x', True, False)
('中文', '中文', 'NOUN', 'NN', 'nmod:assmod', 'xx', True, False)
('的', '的', 'PART', 'DEG', 'case', 'x', True, True)
('句子', '句子', 'NOUN', 'NN', 'ROOT', 'xx', True, False)
## Output in different ways
# f-string formatting is the modern replacement for the %-operator.
for token in doc:
    print(f'{token.text}_{token.tag_}')

# Build the "text/tag" string with a single join instead of repeated
# string concatenation (which is quadratic).  Each piece keeps its leading
# space so the result is byte-identical to the original loop.
out = ''.join(' ' + '/'.join((token.text, token.tag_)) for token in doc)
print(out)
# Noun chunking not working??
# NOTE(review): doc.noun_chunks depends on language-specific noun-chunk
# rules; the Chinese pipelines presumably do not provide them, which would
# explain the empty output — verify against the spaCy zh model docs.
for n in doc.noun_chunks:
    print(n.text)
## Check meaning of a POS tag (Not working??)
# NOTE(review): spacy.explain returns None for tags absent from spaCy's
# built-in glossary; treebank-specific Chinese tags such as 'VC' are likely
# not listed, hence no output — confirm against spacy.glossary.
spacy.explain('VC')

Visualizing Linguistic Features#

# Visualize the dependency parse of the sentence (renders inline in Jupyter).
displacy.render(doc, style="dep")
ADV VERB NUM NUM 中文 NOUN PART 句子 NOUN advmod cop nummod mark:clf nmod:assmod case
# Render the dependency parse again with custom display options:
# compact layout, blue background, white text, and tighter arc spacing.
options = dict(
    compact=True,
    bg="#09a3d5",
    color="white",
    font="Source Sans Pro",
    distance=120,
)
displacy.render(doc, style="dep", options=options)
ADV VERB NUM NUM 中文 NOUN PART 句子 NOUN advmod cop nummod mark:clf nmod:assmod case
## longer paragraphs
# Split the paragraph at the comma character into clause-like chunks,
# then count how many chunks were produced.
text_long = '武漢肺炎全球肆虐,至今已有2906萬人確診、92萬染疫身亡,而流亡美國的中國大陸病毒學家閻麗夢,14日時開通了推特帳號,並公布一份長達26頁的科學論文,研究直指武肺病毒與自然人畜共通傳染病的病毒不同,並呼籲追查武漢P4實驗室及美國衛生研究院(NIH)之間的金流,引發討論。'
text_long_list = text_long.split(',')
len(text_long_list)

# Show each clause-like chunk on its own line.
for chunk in text_long_list:
    print(chunk)
武漢肺炎全球肆虐
至今已有2906萬人確診、92萬染疫身亡
而流亡美國的中國大陸病毒學家閻麗夢
14日時開通了推特帳號
並公布一份長達26頁的科學論文
研究直指武肺病毒與自然人畜共通傳染病的病毒不同
並呼籲追查武漢P4實驗室及美國衛生研究院(NIH)之間的金流
引發討論。
## parse the texts
# nlp.pipe streams the chunks through the pipeline (batched, efficient);
# materialize the lazy generator into a list of Doc objects.
doc2 = list(nlp_zh.pipe(text_long_list))
# number of parsed Docs — one per comma-separated chunk
len(doc2)
8
# Visual dependency for each sentence-like chunk: render every parsed Doc
# with the same compact display options.
sentence_spans = list(doc2)
display_opts = {
    "compact": True,
    "bg": "#09a3d5",
    "color": "white",
    "font": "Source Sans Pro",
    "distance": 120,
}
displacy.render(sentence_spans, style="dep", options=display_opts)
武漢 VERB 肺炎 NOUN 全球 NOUN 肆虐 VERB nsubj nsubj 至今 ADV ADV VERB 2906 NUM 萬人 NOUN 確診、 VERB 92 NUM ADV 染疫 VERB 身亡 VERB advmod advmod dep nsubj ccomp dep mark:clf conj dobj ADV 流亡 ADJ 美國 NOUN PART 中國 PROPN 大陸 ADV 病毒 NOUN 學家 NOUN 閻麗 NOUN NOUN advmod acl dobj mark compound:nn amod nsubj amod nsubj 14日 NOUN PROPN 開通 NOUN PART 推特 ADJ 帳號 NOUN nmod:tmod nsubj aux:asp compound:nn dobj PROPN 公布 VERB NUM NUM 長達 NOUN 26 NUM NOUN PART 科學 NOUN 論文 NOUN nsubj nummod mark:clf acl nmod:range mark:clf mark dobj ccomp 研究 VERB 直指 ADJ 武肺 NOUN 病毒 NOUN NOUN 自然人 NOUN 畜共 PROPN 通傳 VERB 染病 VERB PART 病毒 NOUN 不同 VERB acl compound:vc amod compound:nn compound:nn compound:nn compound:nn compound:nn conj mark nsubj PROPN 呼籲 NOUN 追查 VERB 武漢 NOUN P4 NOUN VERB 驗室 NOUN CCONJ 美國 NOUN 衛生 NOUN 研究院( NOUN NIH) PROPN PART NOUN PART 金流 NOUN nmod:assmod nsubj compound:nn nsubj acl dobj cc amod compound:nn compound:nn nmod:assmod case nmod:assmod case dobj 引發 VERB 討論。 NOUN dobj
# Entity visualization: highlight CARDINAL entities with a custom gradient.
colors = {"CARDINAL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["CARDINAL"], "colors": colors}


# BUG FIX: the options dict was built but never passed to render, so the
# custom entity filter/colors were silently ignored; pass it explicitly.
displacy.render(sentence_spans[1], style="ent", options=options)
至今已有 2906 CARDINAL 萬人確診、 92 CARDINAL 萬染疫身亡

NP Chunking#

## Print noun phrase for each doc
# For every parsed chunk, print each noun phrase together with its root
# token, the root's dependency label, and the root's head token; a '---'
# separator marks the end of each chunk's output.
for parsed in doc2:
    for np in parsed.noun_chunks:
        print(f'{np.text} {np.root.text} {np.root.dep_} {np.root.head.text}')
    print('---')
---
---
---
---
---
---
---
---

Named Entity Recognition#

  • Text: original entity text

  • Start: index of start of entity in the Doc

  • End: index of end of entity in the Doc

  • Label: Entity label, type

## Print ents for each doc
# Entity text and label for every named entity found in each chunk;
# a '---' separator marks the end of each chunk's output.
for parsed in doc2:
    for ent in parsed.ents:
        print(f'{ent.text} {ent.label_}')
    print('---')
---
2906 CARDINAL
92 CARDINAL
---
---
14日 CARDINAL
---
26 CARDINAL
---
---
---
---