Network Analysis#

DEMO_DATA_ROOT = "../../../RepositoryData/data"
# from google.colab import drive
# drive.mount('/content/drive')

Word Similarities from Embeddings#

If necessary, install spacy, the Chinese language model zh_core_web_lg (GloVe embeddings), and pyvis.

# !pip install spacy==2.3
# !spacy download zh_core_web_lg
# !pip install pyvis

Load the packages.

import spacy
import numpy as np
import pandas as pd
import unicodedata
nlp_zh = spacy.load('zh_core_web_lg')

# near-synonymous Chinese reporting verbs, roughly: 'feel/think', 'believe', 'claim', 'state', 'emphasize', 'show', 'explain', 'point out', 'propose/raise', 'advocate'
near_syns = ['覺得','認為','宣稱','表示','強調','顯示', '說明','指出','提出','主張']

Inspect the word vector matrix of the spacy model.

glove_word_vectors = nlp_zh.vocab.vectors
print('Spacy GloVe word vectors Shape: (vocab_size, embedding_dim)',glove_word_vectors.shape)
len(glove_word_vectors)
Spacy GloVe word vectors Shape: (vocab_size, embedding_dim) (500000, 300)
500000
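
Individual entries in this table can be looked up through the vocabulary. A minimal sanity check, using the first near-syn (has_vector, vector, and vector_norm are standard spacy Lexeme attributes):

lex = nlp_zh.vocab['覺得']
print(lex.has_vector)    # True if the word has a GloVe vector
print(lex.vector.shape)  # (300,), i.e., the embedding_dim
print(lex.vector_norm)   # L2 norm of the vector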

Compute the pairwise similarities of the words in the near-syn list.

w1 = nlp_zh.vocab['認為']
w2 = nlp_zh.vocab['覺得']
# cosine similarity between the two lexemes (cf. the 0.69 cell in the table below)
w1.similarity(w2)

# pairwise similarities of near-syns


def pairwise_similarity(word_list, nlp):
    """Return a symmetric matrix of pairwise lexeme cosine similarities."""
    word_sim_matrix = np.ones(shape=(len(word_list), len(word_list)))
    for i, w1 in enumerate(word_list):
        for j, w2 in enumerate(word_list):
            if w1 != w2:
                word_sim_matrix[i, j] = nlp.vocab[str(w1)].similarity(nlp.vocab[str(w2)])
    return word_sim_matrix

pd.DataFrame(data= np.round(pairwise_similarity(near_syns, nlp_zh),2),
             index=near_syns,
             columns=near_syns)
覺得 認為 宣稱 表示 強調 顯示 說明 指出 提出 主張
覺得 1.00 0.69 0.47 0.18 0.55 0.37 0.48 0.16 0.01 0.46
認為 0.69 1.00 0.72 0.39 0.77 0.58 0.59 0.45 0.19 0.73
宣稱 0.47 0.72 1.00 0.38 0.71 0.58 0.60 0.38 0.20 0.70
表示 0.18 0.39 0.38 1.00 0.43 0.27 0.23 0.78 0.48 0.22
強調 0.55 0.77 0.71 0.43 1.00 0.55 0.69 0.49 0.28 0.73
顯示 0.37 0.58 0.58 0.27 0.55 1.00 0.62 0.30 0.06 0.37
說明 0.48 0.59 0.60 0.23 0.69 0.62 1.00 0.29 0.19 0.54
指出 0.16 0.45 0.38 0.78 0.49 0.30 0.29 1.00 0.62 0.27
提出 0.01 0.19 0.20 0.48 0.28 0.06 0.19 0.62 1.00 0.27
主張 0.46 0.73 0.70 0.22 0.73 0.37 0.54 0.27 0.27 1.00

To reduce the computation cost, extract the vocabulary of the Chinese model by excluding:

  • ASCII characters

  • digits

  • punctuation

In addition, consider only two-character words (a filtering sketch is given after the vocabulary inspection below).

vocab = list(nlp_zh.vocab.strings)
#vocab = [w.text for w in nlp_zh.vocab if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct]
print(len(vocab))
print(vocab[20000:20200])
544337
['2022', '022', '乌拉特后旗', '特后旗', '温差', '湘潭', 'play', '留學', '索取', '透明度', '孤立', '伊始', '安全法', '故居', '中医院', '番茄', '07月', '历任', '預算', '十字', '手柄', '利润率', '133', '涛', 'Office', '宝博', '企稳', '加�', '代辦', '紧缺', '重现', '冲着', '大利', '播种', '随手', '克什克腾旗', '克腾旗', "'s", "'x", '换来', '受灾', '亮眼', '峦�', '峦', '计数', '操穴', '100米', '00米', '展品', '帶動', '前任', 'a站', '表率', '社科', '供奉', '安检', '吉泽明', '学生会', '三线', '清凉', '取暖', '隐蔽', '无所谓', '不在乎', '粗大', '串联', '切尔西', '時光', '增殖', '宜宾市', '溫暖', '燕子', '燕', '后天', '冒出', '權力', '倫敦', '波司登', '胜地', '值当', '康健', '协和', '朴素', '胸口', '樱花', '樱', '孔明', '少许', '嵌入', '镍', '掘金', '掘', '推�', '项链', '包赢', '制作人', '增产', '交流区', '妆品', '妆', '温恒', '未婚', '非金属', '事前', '台账', '强强', '银行家', '大树', '小哥', '纱', '肤色', '肤', '陡然', '陡', '打水', '電源', '项目部', '樂團', '兩位', '来不及', '邻家', '外星人', '黄网站', '南充', '市直', '带入', '電影院', '摔倒', '礼服', '建造师', '5', '自拍区', '贯通', '沿岸', '透玩', 'LOGO', 'logo', 'OGO', '他家', '领空', '稀少', '13%', '山林', '频w', '算单', '田野', '猜想', '這裏', '增強', '文山', '不俗', '收費', '配电', '利害', '萌', '捡', '开播', '依规', '深知', '株洲', '产视', '\x07\x06\x05', '玩场', '事務', '常州市', 'MB', 'mb', '一早', '乐网', '祖母', '二季度', 'AV天堂', 'V天堂', 'XXxx', '石川', '解散', '天国', '开房', '野狼', '法语', '承德市', '赶来', '光临', '涉事', '著作权', '老人们', '怀化', '节水', '米兜', '腰部', '尤物', '频自', '黑洞', '编导', '永利网', '乾淨', '戀愛', '戀', '私阴', '脱水']
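
The filtering described above is applied later inside the most_similar functions; as a rough, illustrative sketch, the same criteria can also be expressed directly with the Lexeme flags (is_ascii, is_digit, and is_punct are standard spacy attributes; the resulting count will depend on the model version):

# sketch: keep two-character, non-ASCII, non-digit, non-punctuation words with a non-zero vector
vocab_filtered = [
    w for w in vocab
    if len(w) == 2
    and not nlp_zh.vocab[w].is_ascii
    and not nlp_zh.vocab[w].is_digit
    and not nlp_zh.vocab[w].is_punct
    and np.count_nonzero(nlp_zh.vocab[w].vector)
]
print(len(vocab_filtered))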

For each near-syn, we compute its similarity to every other word in the model vocabulary.

Take the first near-syn, 覺得, as an example.

%%time

target_word = '覺得'
word_sim = []
# compare every word in the vocabulary with the target word

target_word_vocab = nlp_zh.vocab[target_word]
for w in vocab:
    w_vocab = nlp_zh.vocab[w]
    if w_vocab.vector is not None and np.count_nonzero(w_vocab.vector) and not w_vocab.is_ascii and not w_vocab.is_punct and w!=target_word:
        word_sim.append((w, target_word_vocab.similarity(w_vocab)))
CPU times: user 42.6 s, sys: 761 ms, total: 43.3 s
Wall time: 42.9 s

Extract the top 10 words that are most similar to the first near-syn.

sorted(word_sim, key=lambda x:x[1], reverse=True)[:10]
[('覺', 0.84788847),
 ('其實', 0.79569775),
 ('會覺', 0.788269),
 ('以為', 0.78638524),
 ('感覺', 0.7840089),
 ('看來', 0.7798325),
 ('畢竟', 0.7633344),
 ('看起來', 0.7629494),
 ('因為', 0.7625315),
 ('討厭', 0.74918205)]

Each vocabulary entry (a spacy Lexeme) has several properties that are useful for filtering out irrelevant words before computing the word similarities.

Note

The Chinese spacy language model does not seem to include the word probability information.

#w.is_lower == word.is_lower and w.prob >= -15
w1 = nlp_zh.vocab['覺得']
w2 = nlp_zh.vocab['ship']

print(w2.is_ascii)
print(w2.is_currency)
print(w2.is_punct)
True
False
False
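
To double-check the note above, we can look at Lexeme.prob (a smoothed log probability); in models shipped without a probability table, every word typically falls back to the same default value. A quick check (output not shown):

# if no probability table is bundled, these should all print the same default value
for w in ['覺得', '認為', '表示']:
    print(w, nlp_zh.vocab[w].prob)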

Define functions to extract the top-N similar words.

import numba
from numba import jit

@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    assert(u.shape[0] == v.shape[0])
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta
## Efficient version (numba-compiled cosine similarity)
def most_similar_v1(word, topn=5):
    word = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    by_similarity = sorted(queries, key=lambda w: cosine_similarity_numba(w.vector, word.vector), reverse=True)
    return [(w.text, w.similarity(word)) for w in by_similarity[:topn + 1] if w.text != word.text]

## Naive version (spacy's built-in similarity)
def most_similar_v2(word, topn=5):
    word = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return [(w.text, w.similarity(word)) for w in by_similarity[:topn + 1] if w.text != word.text]

Compare the running time of the two versions.

%%time
most_similar_v1("覺得", topn=3)
CPU times: user 7.42 s, sys: 309 ms, total: 7.73 s
Wall time: 14.9 s
[('其實', 0.79569775), ('會覺', 0.788269), ('以為', 0.78638524)]
%%time
most_similar_v2("覺得", topn=3)
CPU times: user 29.3 s, sys: 692 ms, total: 30 s
Wall time: 29.7 s
[('其實', 0.79569775), ('會覺', 0.788269), ('以為', 0.78638524)]

Defining Nodes for the Network#

  • Extract the top 1000 similar words for each near-syn.

  • These top 1000 context words will form the basis for the nodes of the network.

%%time
near_syn_topn = dict([(w, most_similar_v1(w, topn=1000)) for w in near_syns])
CPU times: user 1min 8s, sys: 619 ms, total: 1min 9s
Wall time: 1min 9s

Inspect the top 10 similar words for each near-syn in the list.

For example, the top 10 similar words for 覺得:

near_syn_topn[near_syns[0]][:10]
[('其實', 0.79569775),
 ('會覺', 0.788269),
 ('以為', 0.78638524),
 ('感覺', 0.7840089),
 ('看來', 0.7798325),
 ('畢竟', 0.7633344),
 ('因為', 0.7625315),
 ('討厭', 0.74918205),
 ('總覺', 0.743788),
 ('們覺', 0.74213)]

Convert the dictionary of tuples into a flat list of (near-syn, context word, similarity) triples, which is easier to import into the graph structure.

near_syn_topn_list = []
for w, s in near_syn_topn.items():
    for s_w, s_s in s:
        near_syn_topn_list.append((w, s_w, s_s))
print(near_syn_topn_list[:10])
print(len(near_syn_topn_list))
[('覺得', '其實', 0.79569775), ('覺得', '會覺', 0.788269), ('覺得', '以為', 0.78638524), ('覺得', '感覺', 0.7840089), ('覺得', '看來', 0.7798325), ('覺得', '畢竟', 0.7633344), ('覺得', '因為', 0.7625315), ('覺得', '討厭', 0.74918205), ('覺得', '總覺', 0.743788), ('覺得', '們覺', 0.74213)]
10000
import pandas as pd
df = pd.DataFrame(near_syn_topn_list,columns=['w1','w2','sim'])
df[df['sim']>0.6]
w1 w2 sim
0 覺得 其實 0.795698
1 覺得 會覺 0.788269
2 覺得 以為 0.786385
3 覺得 感覺 0.784009
4 覺得 看來 0.779832
... ... ... ...
9069 主張 方針 0.602148
9070 主張 這與 0.601582
9071 主張 迴避 0.601496
9072 主張 體制 0.601390
9073 主張 意見 0.601044

669 rows × 3 columns

Define Connections between Nodes#

  • While the context nodes already have connections (i.e., edges) to the key nodes (i.e., the near-syns), the context nodes may themselves be interconnected because of their semantic similarity.

  • We again utilize the spacy language model to determine their semantic similarities.

  • These similarities serve as the basis for the edges of the network.

We first identify all potential nodes for the network and then compute their pairwise similarities based on the spacy GloVe embeddings.

  • nodes_id: all the possible nodes of the graph.

  • edges_df: all the context-key and context-context edges of the graph.

WORD_SIMILARITY_CUTOFF = 0.65 # collexemes and target words
df2 = df[df['sim'] > WORD_SIMILARITY_CUTOFF]
nodes_id = list(set(list(df2['w2'].values) + list(df2['w1'].values)))

# nodes_similarities = pairwise_similarity(nodes_id, nlp_zh)
# nodes_similarities_df = pd.DataFrame(nodes_similarities, index=nodes_id,columns=nodes_id)
# nodes_similarities_df
## Creating nodes pairwise similarity matrix
print(len(nodes_id))
m = len(nodes_id)
distances = np.zeros((m,m))

for i in range(m):
    for j in range(m):  
        distances[i,j] = nlp_zh.vocab[nodes_id[i]].similarity(nlp_zh.vocab[nodes_id[j]])

## Flattening the matrix
EMBEDDING_CUTOFF = 0.75

#print(node_names)
distances_flat = []

for i in range(m):
    for j in range(m):
        if distances[i,j]> EMBEDDING_CUTOFF and i != j:
            distances_flat.append((nodes_id[i], nodes_id[j], distances[i,j]))

edges_df = pd.DataFrame(distances_flat, columns=['w1','w2','sim'])
print(edges_df.shape)
180
(582, 3)
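
The double loop above makes roughly m² similarity calls. Since the vectors are already stored in the model, the same matrix can also be computed in one shot with a normalized matrix product; a minimal sketch (assuming every word in nodes_id has a non-zero vector):

# sketch: vectorized pairwise cosine similarities
vecs = np.vstack([nlp_zh.vocab[w].vector for w in nodes_id])    # shape (m, 300)
unit_vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-length rows
sim_matrix = unit_vecs @ unit_vecs.T                            # shape (m, m)
# should agree with the looped `distances` above up to float32 rounding
print(np.allclose(sim_matrix, distances, atol=1e-4))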

We then combine the context-key edges with the context-context edges; together they form the final edge list of the graph.

# DataFrame.append() was removed in pandas 2.0; pd.concat() is the equivalent
edges_df = pd.concat([edges_df, df2]).drop_duplicates()
print(edges_df.shape)
edges_df.loc[100:120,:]
(785, 3)
w1 w2 sim
100 這樣 因為 0.799578
101 這樣 還是 0.754334
102 這樣 同樣 0.791491
103 這樣 這個 0.851545
104 這樣 應該 0.806186
105 這樣 其實 0.807909
106 這樣 畢竟 0.771049
107 這樣 這點 0.763548
108 這樣 沒有 0.756848
109 這樣 當然 0.778150
110 這樣 確實 0.770117
111 這樣 本來 0.769549
112 這樣 這麼 0.816209
113 所謂 其實 0.751369
114 所謂 並非 0.758192
115 納悶 驚訝 0.788380
116 納悶 訝異 0.826738
117 還是 總之 0.761258
118 還是 因為 0.808997
119 還是 總是 0.821862
120 還是 這樣 0.754334

Creating a Network#

  • We use networkx to first create a graph and compute relevant node-level metrics, e.g., centralities.

  • We then create two data frames for aesthetic specification of the graph:

    • nodes_df

    • edges_df

  • We use pyvis to visualize the network.

import networkx as nx
from pyvis.network import Network
#import pyvis.options as options
#from sklearn.preprocessing import MinMaxScaler
#from scipy.spatial.distance import cosine
#G = nx.Graph()
## A function to rescale metrics for plotting
def myRescaler(x):
    x = np.array(x)
    y = np.interp(x, (x.min(), x.max()), (5, 20))
    return list(y)
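
For example, myRescaler maps the smallest input value to 5 and the largest to 20, with values in between interpolated linearly:

myRescaler([0.1, 0.5, 0.9])  # [5.0, 12.5, 20.0]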

Create nodes_df.

G= nx.from_pandas_edgelist(edges_df, 'w1','w2','sim')

nodes_df = pd.DataFrame({'id':list(G.nodes),
                         'betweenness': myRescaler(list(nx.betweenness_centrality(G).values())),
                         'eigenvector': myRescaler(list(nx.eigenvector_centrality(G).values()))})
# align the aesthetic attributes with nodes_df['id'] (key words get larger nodes)
nodes_df['size'] = [10 if w in near_syns else 5 for w in nodes_df['id']]
nodes_df['size2'] = [30 if w in near_syns else e for w, e in zip(nodes_df['id'], nodes_df['eigenvector'])]
nodes_df['group'] = ['KEY' if nodes_df.loc[i,'id'] in near_syns else 'CONTEXT' for i in range(nodes_df.shape[0])]
nodes_df['color'] = ['lightpink' if nodes_df.loc[i,'group']=='KEY' else 'lightblue' for i in range(nodes_df.shape[0])]
nodes_df['borderWidthSelected'] = list(np.repeat(20.0, nodes_df.shape[0]))
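
Before turning to the visualization, it can be useful to glance at which nodes are most central; a quick inspection (output not shown):

# the words with the highest (rescaled) eigenvector centrality
nodes_df.sort_values('eigenvector', ascending=False).head(10)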

Visualizing a Network#

Plotting the network using pyvis.

Gvis = Network("768px","1600px", notebook=False,heading="Semantic Network")
# # Gvis.from_nx(G)
edges_in = list(edges_df.to_records(index=False))
#Gvis.add_nodes(list(G.nodes), value=nodes_df['size2'], color=nodes_df['color'], borderWidthSelected = nodes_df['borderWidthSelected'])

for i in range(nodes_df.shape[0]):
  Gvis.add_node(list(G.nodes)[i], value=nodes_df.loc[i,'size2'], group=nodes_df.loc[i,'group'])#, color=nodes_df.loc[i,'color'], borderWidthSelected = nodes_df.loc[i,'borderWidthSelected'])

Gvis.add_edges(edges_in)
#Gvis.show_buttons()
Gvis.set_options("""
  var options = {
    "nodes": {
      "borderWidth": 0,
      "color": {
        "highlight": {
          "border": "rgba(221,171,197,1)",
          "background": "rgba(248,178,255,1)"
        }
      },
      "shadow": {
        "enabled": true
      }
    },
    "edges": {
      "color": {
        "highlight": "rgba(255,192,200,1)",
        "inherit": false
      },
      "smooth": false
    },
    "interaction": {
      "hover": true,
      "navigationButtons": true
    },
    "manipulation": {
      "enabled": true
    },
    "physics": {
      "barnesHut": {
        "springLength": 270
      },
      "minVelocity": 0.75
    }
  }
""")


  # groups: {
  #   myGroup: {color:{background:'red'}, borderWidth:3}
  # }
Gvis.show(DEMO_DATA_ROOT + '/reporting_verbs_chinese_Gvis.html')
edges_df.to_pickle(DEMO_DATA_ROOT+'/reporting_verbs_chinese_edges_df.pickle')

References#