Network Analysis#
DEMO_DATA_ROOT = "../../../RepositoryData/data"
# from google.colab import drive
# drive.mount('/content/drive')
Word Similarities from Embeddings#
If necessary, install spacy and the Chinese language model zh_core_web_lg (which ships with GloVe embeddings).
# !pip install spacy==2.3
# !spacy download zh_core_web_lg
# !pip install pyvis
Load the packages.
import spacy
import numpy as np
import pandas as pd
import unicodedata
nlp_zh = spacy.load('zh_core_web_lg')
near_syns = ['覺得','認為','宣稱','表示','強調','顯示', '說明','指出','提出','主張']
Inspect the word-vector matrix of the spacy model.
glove_word_vectors = nlp_zh.vocab.vectors
print('Spacy GloVe word vectors Shape: (vocab_size, embedding_dim)',glove_word_vectors.shape)
len(glove_word_vectors)
Spacy GloVe word vectors Shape: (vocab_size, embedding_dim) (500000, 300)
500000
Compute the pairwise similarities among the words in the near-syn list.
w1 = nlp_zh.vocab['認為']
w2 = nlp_zh.vocab['覺得']
# pairwise similarities of near-syns
def pairwise_similarity(word_list, nlp):
    word_sim_matrix = np.ones(shape=(len(word_list), len(word_list)))
    for i, w1 in enumerate(word_list):
        # print(str(i) + ' ' + w1)
        for j, w2 in enumerate(word_list):
            if w1 != w2:
                word_sim_matrix[i, j] = nlp.vocab[str(w1)].similarity(nlp.vocab[str(w2)])
    return word_sim_matrix
pd.DataFrame(data=np.round(pairwise_similarity(near_syns, nlp_zh), 2),
             index=near_syns,
             columns=near_syns)
| | 覺得 | 認為 | 宣稱 | 表示 | 強調 | 顯示 | 說明 | 指出 | 提出 | 主張 |
|---|---|---|---|---|---|---|---|---|---|---|
| 覺得 | 1.00 | 0.69 | 0.47 | 0.18 | 0.55 | 0.37 | 0.48 | 0.16 | 0.01 | 0.46 |
| 認為 | 0.69 | 1.00 | 0.72 | 0.39 | 0.77 | 0.58 | 0.59 | 0.45 | 0.19 | 0.73 |
| 宣稱 | 0.47 | 0.72 | 1.00 | 0.38 | 0.71 | 0.58 | 0.60 | 0.38 | 0.20 | 0.70 |
| 表示 | 0.18 | 0.39 | 0.38 | 1.00 | 0.43 | 0.27 | 0.23 | 0.78 | 0.48 | 0.22 |
| 強調 | 0.55 | 0.77 | 0.71 | 0.43 | 1.00 | 0.55 | 0.69 | 0.49 | 0.28 | 0.73 |
| 顯示 | 0.37 | 0.58 | 0.58 | 0.27 | 0.55 | 1.00 | 0.62 | 0.30 | 0.06 | 0.37 |
| 說明 | 0.48 | 0.59 | 0.60 | 0.23 | 0.69 | 0.62 | 1.00 | 0.29 | 0.19 | 0.54 |
| 指出 | 0.16 | 0.45 | 0.38 | 0.78 | 0.49 | 0.30 | 0.29 | 1.00 | 0.62 | 0.27 |
| 提出 | 0.01 | 0.19 | 0.20 | 0.48 | 0.28 | 0.06 | 0.19 | 0.62 | 1.00 | 0.27 |
| 主張 | 0.46 | 0.73 | 0.70 | 0.22 | 0.73 | 0.37 | 0.54 | 0.27 | 0.27 | 1.00 |
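Any single cell of this matrix can be reproduced directly from the two Lexeme objects defined above, since Lexeme.similarity() returns the cosine similarity of their GloVe vectors. A minimal check (compare with the 認為/覺得 cell of the table):
print(w1.similarity(w2))  # roughly 0.69, the 認為/覺得 entry above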
To reduce the computation cost, extract the vocabulary of the Chinese model, excluding:
- ASCII characters
- digits
- punctuation
Also, consider only two-character words (a sketch of such a filter follows).
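The commented-out line in the next cell hints at this filter; a fuller version of the idea might look like the following (a sketch only; the notebook itself applies the filtering later, inside the most_similar functions below):
# a sketch of the vocabulary filter described above (the notebook applies
# this filtering later, inside most_similar_v1/v2)
vocab_filtered = [
    w.text for w in nlp_zh.vocab
    if np.count_nonzero(w.vector)   # keep words that have a GloVe vector
    and not w.is_ascii              # drop ASCII-only entries
    and not w.is_digit              # drop digits
    and not w.is_punct              # drop punctuation
    and len(w.text) == 2            # keep two-character words only
]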
vocab = list(nlp_zh.vocab.strings)
#vocab = [w.text for w in nlp_zh.vocab if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct]
print(len(vocab))
print(vocab[20000:20200])
544337
['2022', '022', '乌拉特后旗', '特后旗', '温差', '湘潭', 'play', '留學', '索取', '透明度', '孤立', '伊始', '安全法', '故居', '中医院', '番茄', '07月', '历任', '預算', '十字', '手柄', '利润率', '133', '涛', 'Office', '宝博', '企稳', '加�', '代辦', '紧缺', '重现', '冲着', '大利', '播种', '随手', '克什克腾旗', '克腾旗', "'s", "'x", '换来', '受灾', '亮眼', '峦�', '峦', '计数', '操穴', '100米', '00米', '展品', '帶動', '前任', 'a站', '表率', '社科', '供奉', '安检', '吉泽明', '学生会', '三线', '清凉', '取暖', '隐蔽', '无所谓', '不在乎', '粗大', '串联', '切尔西', '時光', '增殖', '宜宾市', '溫暖', '燕子', '燕', '后天', '冒出', '權力', '倫敦', '波司登', '胜地', '值当', '康健', '协和', '朴素', '胸口', '樱花', '樱', '孔明', '少许', '嵌入', '镍', '掘金', '掘', '推�', '项链', '包赢', '制作人', '增产', '交流区', '妆品', '妆', '温恒', '未婚', '非金属', '事前', '台账', '强强', '银行家', '大树', '小哥', '纱', '肤色', '肤', '陡然', '陡', '打水', '電源', '项目部', '樂團', '兩位', '来不及', '邻家', '外星人', '黄网站', '南充', '市直', '带入', '電影院', '摔倒', '礼服', '建造师', '5', '自拍区', '贯通', '沿岸', '透玩', 'LOGO', 'logo', 'OGO', '他家', '领空', '稀少', '13%', '山林', '频w', '算单', '田野', '猜想', '這裏', '增強', '文山', '不俗', '收費', '配电', '利害', '萌', '捡', '开播', '依规', '深知', '株洲', '产视', '\x07\x06\x05', '玩场', '事務', '常州市', 'MB', 'mb', '一早', '乐网', '祖母', '二季度', 'AV天堂', 'V天堂', 'XXxx', '石川', '解散', '天国', '开房', '野狼', '法语', '承德市', '赶来', '光临', '涉事', '著作权', '老人们', '怀化', '节水', '米兜', '腰部', '尤物', '频自', '黑洞', '编导', '永利网', '乾淨', '戀愛', '戀', '私阴', '脱水']
For each near-syn, we compute the similarities between the near-syn and all the other words in the model vocabulary.
Take the first near-syn for example.
%%time
target_word = '覺得'
word_sim = []
# compute the similarity between each word in the vocab and the target word
target_word_vocab = nlp_zh.vocab[target_word]
for w in vocab:
    w_vocab = nlp_zh.vocab[w]
    if w_vocab.vector is not None and np.count_nonzero(w_vocab.vector) and not w_vocab.is_ascii and not w_vocab.is_punct and w != target_word:
        word_sim.append((w, target_word_vocab.similarity(w_vocab)))
CPU times: user 42.6 s, sys: 761 ms, total: 43.3 s
Wall time: 42.9 s
Extract the top 10 words that are most similar to the first near-syn.
sorted(word_sim, key=lambda x:x[1], reverse=True)[:10]
[('覺', 0.84788847),
('其實', 0.79569775),
('會覺', 0.788269),
('以為', 0.78638524),
('感覺', 0.7840089),
('看來', 0.7798325),
('畢竟', 0.7633344),
('看起來', 0.7629494),
('因為', 0.7625315),
('討厭', 0.74918205)]
Each lexeme in the spacy vocabulary has several properties that are useful for filtering out irrelevant words before computing the word similarities.
Note
The Chinese spacy language model does not seem to include the word probability information.
#w.is_lower == word.is_lower and w.prob >= -15
w1 = nlp_zh.vocab['覺得']
w2 = nlp_zh.vocab['ship']
print(w2.is_ascii)
print(w2.is_currency)
print(w2.is_punct)
True
False
False
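A few more Lexeme flags can be queried in the same way and are handy for the vocabulary filtering described above; a short sketch using the lexeme w1 defined in the previous cell (all attributes are standard spacy Lexeme properties):
print(w1.is_digit)                      # True only for digit tokens
print(w1.is_alpha)                      # True for alphabetic tokens (including Chinese characters)
print(w1.is_stop)                       # True if the word is in the model's stop-word list
print(np.count_nonzero(w1.vector) > 0)  # whether the lexeme has a non-zero GloVe vector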
Define functions to extract the top-N similar words:
- The functions are adapted from this SO discussion thread.
- A numba-compiled cosine function deals with the computational cost of the large embedding matrix.
import numba
from numba import jit
@jit(nopython=True)
def cosine_similarity_numba(u: np.ndarray, v: np.ndarray):
    assert u.shape[0] == v.shape[0]
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i] * v[i]
        uu += u[i] * u[i]
        vv += v[i] * v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv / np.sqrt(uu * vv)
    return cos_theta
## Efficient version
def most_similar_v1(word, topn=5):
    word = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    # by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    by_similarity = sorted(queries, key=lambda w: cosine_similarity_numba(w.vector, word.vector), reverse=True)
    return [(w.text, w.similarity(word)) for w in by_similarity[:topn + 1] if w.text != word.text]
## Naive version
def most_similar_v2(word, topn=5):
    word = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    # by_similarity = sorted(queries, key=lambda w: cosine_similarity_numba(w.vector, word.vector), reverse=True)
    return [(w.text, w.similarity(word)) for w in by_similarity[:topn + 1] if w.text != word.text]
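Because most_similar_v1 sorts by the numba cosine but reports scores via Lexeme.similarity(), it is worth confirming that the two agree; a quick sanity check (a sketch):
a = nlp_zh.vocab['覺得']
b = nlp_zh.vocab['認為']
print(cosine_similarity_numba(a.vector, b.vector))  # should match the value below up to floating-point error
print(a.similarity(b))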
Compare the time needed by the two versions.
%%time
most_similar_v1("覺得", topn=3)
CPU times: user 7.42 s, sys: 309 ms, total: 7.73 s
Wall time: 14.9 s
[('其實', 0.79569775), ('會覺', 0.788269), ('以為', 0.78638524)]
%%time
most_similar_v2("覺得", topn=3)
CPU times: user 29.3 s, sys: 692 ms, total: 30 s
Wall time: 29.7 s
[('其實', 0.79569775), ('會覺', 0.788269), ('以為', 0.78638524)]
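Both versions still spend most of their time in a Python loop over Lexeme.similarity() or cosine_similarity_numba(). If the filtered queries are stacked into one matrix, the similarities can instead be computed in a single matrix-vector product; the following most_similar_vectorized is a sketch of that idea (it is not part of the original notebook):
def most_similar_vectorized(word, topn=5):
    target = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    mat = np.vstack([w.vector for w in queries])          # (n_queries, 300)
    sims = mat @ target.vector / (
        np.linalg.norm(mat, axis=1) * np.linalg.norm(target.vector) + 1e-8
    )
    order = np.argsort(-sims)                             # descending cosine similarity
    return [(queries[i].text, float(sims[i]))
            for i in order[:topn + 1] if queries[i].text != target.text][:topn]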
Defining Nodes for the Network#
Extract the top 1000 similar words for each near-syn.
These top 1000 context words form the basis for the nodes of the network.
%%time
near_syn_topn = dict([(w, most_similar_v1(w, topn=1000)) for w in near_syns])
CPU times: user 1min 8s, sys: 619 ms, total: 1min 9s
Wall time: 1min 9s
Top 10 similar words for each synonym in the list.
For example, the top 10 similar words for 覺得:
near_syn_topn[near_syns[0]][:10]
[('其實', 0.79569775),
('會覺', 0.788269),
('以為', 0.78638524),
('感覺', 0.7840089),
('看來', 0.7798325),
('畢竟', 0.7633344),
('因為', 0.7625315),
('討厭', 0.74918205),
('總覺', 0.743788),
('們覺', 0.74213)]
Convert the tuples into a list, which is easier to import into the graph structure.
near_syn_topn_list = []
for w, s in near_syn_topn.items():
    for s_w, s_s in s:
        near_syn_topn_list.append((w, s_w, s_s))
print(near_syn_topn_list[:10])
print(len(near_syn_topn_list))
[('覺得', '其實', 0.79569775), ('覺得', '會覺', 0.788269), ('覺得', '以為', 0.78638524), ('覺得', '感覺', 0.7840089), ('覺得', '看來', 0.7798325), ('覺得', '畢竟', 0.7633344), ('覺得', '因為', 0.7625315), ('覺得', '討厭', 0.74918205), ('覺得', '總覺', 0.743788), ('覺得', '們覺', 0.74213)]
10000
import pandas as pd
df = pd.DataFrame(near_syn_topn_list,columns=['w1','w2','sim'])
df[df['sim']>0.6]
| | w1 | w2 | sim |
|---|---|---|---|
| 0 | 覺得 | 其實 | 0.795698 |
| 1 | 覺得 | 會覺 | 0.788269 |
| 2 | 覺得 | 以為 | 0.786385 |
| 3 | 覺得 | 感覺 | 0.784009 |
| 4 | 覺得 | 看來 | 0.779832 |
| ... | ... | ... | ... |
| 9069 | 主張 | 方針 | 0.602148 |
| 9070 | 主張 | 這與 | 0.601582 |
| 9071 | 主張 | 迴避 | 0.601496 |
| 9072 | 主張 | 體制 | 0.601390 |
| 9073 | 主張 | 意見 | 0.601044 |

669 rows × 3 columns
Define Connections between Nodes#
While the context nodes already have connections (i.e., edges) to the key nodes (i.e., the near-syns), the context nodes may themselves be inter-connected due to their semantic similarity.
We again utilize the spacy language model to determine their semantic similarities. These similarities serve as the basis for the edges of the network.
We first identify all potential nodes for the network and then compute their pairwise similarities based on the spacy GloVe embeddings.
- nodes_id: all the possible nodes of the graph.
- edges_df: all the context-key and context-context edges of the graph.
WORD_SIMILARITY_CUTOFF = 0.65 # collexemes and target words
df2 = df[df['sim'] > WORD_SIMILARITY_CUTOFF]
nodes_id = list(set(list(df2['w2'].values) + list(df2['w1'].values)))
# nodes_similarities = pairwise_similarity(nodes_id, nlp_zh)
# nodes_similarities_df = pd.DataFrame(nodes_similarities, index=nodes_id,columns=nodes_id)
# nodes_similarities_df
## Creating nodes pairwise similarity matrix
print(len(nodes_id))
m = len(nodes_id)
distances = np.zeros((m,m))
for i in range(m):
    for j in range(m):
        distances[i, j] = nlp_zh.vocab[nodes_id[i]].similarity(nlp_zh.vocab[nodes_id[j]])
## Flattening the matrix
EMBEDDING_CUTOFF = 0.75
#print(node_names)
distances_flat = []
for i in range(m):
    for j in range(m):
        if distances[i, j] > EMBEDDING_CUTOFF and i != j:
            distances_flat.append((nodes_id[i], nodes_id[j], distances[i, j]))
edges_df = pd.DataFrame(distances_flat, columns=['w1','w2','sim'])
print(edges_df.shape)
180
(582, 3)
We then combine the context-key edges with the context-context edges. These edges are the final edges for the graph.
edges_df = pd.concat([edges_df, df2]).drop_duplicates()  # DataFrame.append was removed in pandas 2.x; concat gives the same result
print(edges_df.shape)
edges_df.loc[100:120,:]
(785, 3)
| | w1 | w2 | sim |
|---|---|---|---|
| 100 | 這樣 | 因為 | 0.799578 |
| 101 | 這樣 | 還是 | 0.754334 |
| 102 | 這樣 | 同樣 | 0.791491 |
| 103 | 這樣 | 這個 | 0.851545 |
| 104 | 這樣 | 應該 | 0.806186 |
| 105 | 這樣 | 其實 | 0.807909 |
| 106 | 這樣 | 畢竟 | 0.771049 |
| 107 | 這樣 | 這點 | 0.763548 |
| 108 | 這樣 | 沒有 | 0.756848 |
| 109 | 這樣 | 當然 | 0.778150 |
| 110 | 這樣 | 確實 | 0.770117 |
| 111 | 這樣 | 本來 | 0.769549 |
| 112 | 這樣 | 這麼 | 0.816209 |
| 113 | 所謂 | 其實 | 0.751369 |
| 114 | 所謂 | 並非 | 0.758192 |
| 115 | 納悶 | 驚訝 | 0.788380 |
| 116 | 納悶 | 訝異 | 0.826738 |
| 117 | 還是 | 總之 | 0.761258 |
| 118 | 還是 | 因為 | 0.808997 |
| 119 | 還是 | 總是 | 0.821862 |
| 120 | 還是 | 這樣 | 0.754334 |
Creating a Network#
We use networkx to first create a graph and compute relevant node-level metrics, e.g., centralities. We then create two data frames for the aesthetic specification of the graph:
- nodes_df
- edges_df
We use pyvis to visualize the network.
import networkx as nx
from pyvis.network import Network
#import pyvis.options as options
#from sklearn.preprocessing import MinMaxScaler
#from scipy.spatial.distance import cosine
#G = nx.Graph()
## A function to rescale metrics for plotting
def myRescaler(x):
    x = np.array(x)
    y = np.interp(x, (x.min(), x.max()), (5, 20))
    return list(y)
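np.interp maps the smallest input value to 5 and the largest to 20, interpolating linearly in between; a quick check (a sketch):
print(myRescaler([0.1, 0.4, 0.7]))  # [5.0, 12.5, 20.0]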
Create nodes_df.
G = nx.from_pandas_edgelist(edges_df, 'w1', 'w2', 'sim')
nodes_df = pd.DataFrame({'id': list(G.nodes),
                         'betweenness': myRescaler(list(nx.betweenness_centrality(G).values())),
                         'eigenvector': myRescaler(list(nx.eigenvector_centrality(G).values()))})
nodes_df['size'] = [5 if i not in near_syns else 10 for i in nodes_df['id']]  # iterate over the graph's node order, not nodes_id, to keep rows aligned
nodes_df['size2']= [i if i not in near_syns else 30 for i in nodes_df['eigenvector']]
nodes_df['group'] = ['KEY' if nodes_df.loc[i,'id'] in near_syns else 'CONTEXT' for i in range(nodes_df.shape[0])]
nodes_df['color'] = ['lightpink' if nodes_df.loc[i,'group']=='KEY' else 'lightblue' for i in range(nodes_df.shape[0])]
nodes_df['borderWidthSelected'] = list(np.repeat(20.0, nodes_df.shape[0]))
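The centrality scores can be inspected before plotting, e.g. to see which context words are most central in the network (a sketch; the output is not shown here):
nodes_df.sort_values('eigenvector', ascending=False).head(10)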
Visualizing a Network#
Plotting the network using pyvis.
Gvis = Network("768px","1600px", notebook=False,heading="Semantic Network")
# # Gvis.from_nx(G)
edges_in = list(edges_df.to_records(index=False))
#Gvis.add_nodes(list(G.nodes), value=nodes_df['size2'], color=nodes_df['color'], borderWidthSelected = nodes_df['borderWidthSelected'])
for i in range(nodes_df.shape[0]):
    Gvis.add_node(list(G.nodes)[i], value=nodes_df.loc[i, 'size2'], group=nodes_df.loc[i, 'group'])  # color=nodes_df.loc[i,'color'], borderWidthSelected=nodes_df.loc[i,'borderWidthSelected']
Gvis.add_edges(edges_in)
#Gvis.show_buttons()
Gvis.set_options("""
var options = {
  "nodes": {
    "borderWidth": 0,
    "color": {
      "highlight": {
        "border": "rgba(221,171,197,1)",
        "background": "rgba(248,178,255,1)"
      }
    },
    "shadow": {
      "enabled": true
    }
  },
  "edges": {
    "color": {
      "highlight": "rgba(255,192,200,1)",
      "inherit": false
    },
    "smooth": false
  },
  "interaction": {
    "hover": true,
    "navigationButtons": true
  },
  "manipulation": {
    "enabled": true
  },
  "physics": {
    "barnesHut": {
      "springLength": 270
    },
    "minVelocity": 0.75
  }
}
""")
# groups: {
# myGroup: {color:{background:'red'}, borderWidth:3}
# }
Gvis.show(DEMO_DATA_ROOT + '/reporting_verbs_chinese_Gvis.html')
edges_df.to_pickle(DEMO_DATA_ROOT+'/reporting_verbs_chinese_edges_df.pickle')
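The pickled edge list can be read back later with pandas, so the graph can be rebuilt without recomputing the similarities (a sketch):
# rebuild the graph from the saved edge list (a sketch)
edges_df = pd.read_pickle(DEMO_DATA_ROOT + '/reporting_verbs_chinese_edges_df.pickle')
G = nx.from_pandas_edgelist(edges_df, 'w1', 'w2', 'sim')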