Network Analysis#
DEMO_DATA_ROOT = "../../../RepositoryData/data"
# from google.colab import drive
# drive.mount('/content/drive')
Word Similarities from Embeddings#
If necessary, install spacy and the Chinese language model zh_core_web_lg (which ships with GloVe embeddings).
# !pip install spacy==2.3
# !spacy download zh_core_web_lg
# !pip install pyvis
Load the packages.
import spacy
import numpy as np
import pandas as pd
import unicodedata
nlp_zh = spacy.load('zh_core_web_lg')
near_syns = ['覺得','認為','宣稱','表示','強調','顯示', '說明','指出','提出','主張']
Inspect the word-vector matrix of the spacy model.
glove_word_vectors = nlp_zh.vocab.vectors
print('Spacy GloVe word vectors Shape: (vocab_size, embedding_dim)',glove_word_vectors.shape)
len(glove_word_vectors)
Spacy GloVe word vectors Shape: (vocab_size, embedding_dim) (500000, 300)
500000
Compute the pairwise similarities among the words in the near-syn list.
w1 = nlp_zh.vocab['認為']
w2 = nlp_zh.vocab['覺得']
# pairwise similarities of near-syns
def pairwise_similarity(word_list, nlp):
    word_sim_matrix = np.ones(shape=(len(word_list), len(word_list)))
    for i, w1 in enumerate(word_list):
        # print(str(i) + ' ' + w1)
        for j, w2 in enumerate(word_list):
            if w1 != w2:
                word_sim_matrix[i, j] = nlp.vocab[str(w1)].similarity(nlp.vocab[str(w2)])
    return word_sim_matrix
pd.DataFrame(data=np.round(pairwise_similarity(near_syns, nlp_zh), 2),
             index=near_syns,
             columns=near_syns)
| | 覺得 | 認為 | 宣稱 | 表示 | 強調 | 顯示 | 說明 | 指出 | 提出 | 主張 |
|---|---|---|---|---|---|---|---|---|---|---|
| 覺得 | 1.00 | 0.69 | 0.47 | 0.18 | 0.55 | 0.37 | 0.48 | 0.16 | 0.01 | 0.46 |
| 認為 | 0.69 | 1.00 | 0.72 | 0.39 | 0.77 | 0.58 | 0.59 | 0.45 | 0.19 | 0.73 |
| 宣稱 | 0.47 | 0.72 | 1.00 | 0.38 | 0.71 | 0.58 | 0.60 | 0.38 | 0.20 | 0.70 |
| 表示 | 0.18 | 0.39 | 0.38 | 1.00 | 0.43 | 0.27 | 0.23 | 0.78 | 0.48 | 0.22 |
| 強調 | 0.55 | 0.77 | 0.71 | 0.43 | 1.00 | 0.55 | 0.69 | 0.49 | 0.28 | 0.73 |
| 顯示 | 0.37 | 0.58 | 0.58 | 0.27 | 0.55 | 1.00 | 0.62 | 0.30 | 0.06 | 0.37 |
| 說明 | 0.48 | 0.59 | 0.60 | 0.23 | 0.69 | 0.62 | 1.00 | 0.29 | 0.19 | 0.54 |
| 指出 | 0.16 | 0.45 | 0.38 | 0.78 | 0.49 | 0.30 | 0.29 | 1.00 | 0.62 | 0.27 |
| 提出 | 0.01 | 0.19 | 0.20 | 0.48 | 0.28 | 0.06 | 0.19 | 0.62 | 1.00 | 0.27 |
| 主張 | 0.46 | 0.73 | 0.70 | 0.22 | 0.73 | 0.37 | 0.54 | 0.27 | 0.27 | 1.00 |
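Any single cell of this matrix can be reproduced directly from the two Lexeme objects defined above, since Lexeme.similarity() returns the cosine similarity of their GloVe vectors. A minimal check (compare with the 認為/覺得 cell of the table):
print(w1.similarity(w2))  # roughly 0.69, the 認為/覺得 entry above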
To reduce the computation cost, extract the vocabulary of the Chinese model, excluding:
- ASCII characters
- digits
- punctuation
Also, consider only two-character words (a sketch of such a filter follows).
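The commented-out line in the next cell hints at this filter; a fuller version of the idea might look like the following (a sketch only; the notebook itself applies the filtering later, inside the most_similar functions below):
# a sketch of the vocabulary filter described above (the notebook applies
# this filtering later, inside most_similar_v1/v2)
vocab_filtered = [
    w.text for w in nlp_zh.vocab
    if np.count_nonzero(w.vector)   # keep words that have a GloVe vector
    and not w.is_ascii              # drop ASCII-only entries
    and not w.is_digit              # drop digits
    and not w.is_punct              # drop punctuation
    and len(w.text) == 2            # keep two-character words only
]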
vocab = list(nlp_zh.vocab.strings)
#vocab = [w.text for w in nlp_zh.vocab if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct]
print(len(vocab))
print(vocab[20000:20200])
544337
['2022', '022', '乌拉特后旗', '特后旗', '温差', '湘潭', 'play', '留學', '索取', '透明度', '孤立', '伊始', '安全法', '故居', '中医院', '番茄', '07月', '历任', '預算', '十字', '手柄', '利润率', '133', '涛', 'Office', '宝博', '企稳', '加�', '代辦', '紧缺', '重现', '冲着', '大利', '播种', '随手', '克什克腾旗', '克腾旗', "'s", "'x", '换来', '受灾', '亮眼', '峦�', '峦', '计数', '操穴', '100米', '00米', '展品', '帶動', '前任', 'a站', '表率', '社科', '供奉', '安检', '吉泽明', '学生会', '三线', '清凉', '取暖', '隐蔽', '无所谓', '不在乎', '粗大', '串联', '切尔西', '時光', '增殖', '宜宾市', '溫暖', '燕子', '燕', '后天', '冒出', '權力', '倫敦', '波司登', '胜地', '值当', '康健', '协和', '朴素', '胸口', '樱花', '樱', '孔明', '少许', '嵌入', '镍', '掘金', '掘', '推�', '项链', '包赢', '制作人', '增产', '交流区', '妆品', '妆', '温恒', '未婚', '非金属', '事前', '台账', '强强', '银行家', '大树', '小哥', '纱', '肤色', '肤', '陡然', '陡', '打水', '電源', '项目部', '樂團', '兩位', '来不及', '邻家', '外星人', '黄网站', '南充', '市直', '带入', '電影院', '摔倒', '礼服', '建造师', '5', '自拍区', '贯通', '沿岸', '透玩', 'LOGO', 'logo', 'OGO', '他家', '领空', '稀少', '13%', '山林', '频w', '算单', '田野', '猜想', '這裏', '增強', '文山', '不俗', '收費', '配电', '利害', '萌', '捡', '开播', '依规', '深知', '株洲', '产视', '\x07\x06\x05', '玩场', '事務', '常州市', 'MB', 'mb', '一早', '乐网', '祖母', '二季度', 'AV天堂', 'V天堂', 'XXxx', '石川', '解散', '天国', '开房', '野狼', '法语', '承德市', '赶来', '光临', '涉事', '著作权', '老人们', '怀化', '节水', '米兜', '腰部', '尤物', '频自', '黑洞', '编导', '永利网', '乾淨', '戀愛', '戀', '私阴', '脱水']
For each near-syn, we compute the similarities between the near-syn and all the other words in the model vocabulary.
Take the first near-syn for example.
%%time
target_word = '覺得'
word_sim = []
# compute the similarity between each word in the vocab and the target word
target_word_vocab = nlp_zh.vocab[target_word]
for w in vocab:
    w_vocab = nlp_zh.vocab[w]
    if w_vocab.vector is not None and np.count_nonzero(w_vocab.vector) and not w_vocab.is_ascii and not w_vocab.is_punct and w != target_word:
        word_sim.append((w, target_word_vocab.similarity(w_vocab)))
CPU times: user 42.6 s, sys: 761 ms, total: 43.3 s
Wall time: 42.9 s
Extract the top 10 words that are most similar to the first near-syn.
sorted(word_sim, key=lambda x:x[1], reverse=True)[:10]
[('覺', 0.84788847),
('其實', 0.79569775),
('會覺', 0.788269),
('以為', 0.78638524),
('感覺', 0.7840089),
('看來', 0.7798325),
('畢竟', 0.7633344),
('看起來', 0.7629494),
('因為', 0.7625315),
('討厭', 0.74918205)]
Each lexeme in the spacy vocabulary has several properties that are useful for filtering out irrelevant words before computing the word similarities.
Note
The Chinese spacy language model does not seem to include the word probability information.
#w.is_lower == word.is_lower and w.prob >= -15
w1 = nlp_zh.vocab['覺得']
w2 = nlp_zh.vocab['ship']
print(w2.is_ascii)
print(w2.is_currency)
print(w2.is_punct)
True
False
False
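A few more Lexeme flags can be queried in the same way and are handy for the vocabulary filtering described above; a short sketch using the lexeme w1 defined in the previous cell (all attributes are standard spacy Lexeme properties):
print(w1.is_digit)                      # True only for digit tokens
print(w1.is_alpha)                      # True for alphabetic tokens (including Chinese characters)
print(w1.is_stop)                       # True if the word is in the model's stop-word list
print(np.count_nonzero(w1.vector) > 0)  # whether the lexeme has a non-zero GloVe vector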
Define functions to extract the top-N similar words:
- The functions are adapted from this SO discussion thread.
- A numba-compiled cosine function deals with the computational cost of the large embedding matrix.
import numba
from numba import jit
@jit(nopython=True)
def cosine_similarity_numba(u: np.ndarray, v: np.ndarray):
    assert u.shape[0] == v.shape[0]
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i] * v[i]
        uu += u[i] * u[i]
        vv += v[i] * v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv / np.sqrt(uu * vv)
    return cos_theta
## Efficient version
def most_similar_v1(word, topn=5):
    word = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    # by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    by_similarity = sorted(queries, key=lambda w: cosine_similarity_numba(w.vector, word.vector), reverse=True)
    return [(w.text, w.similarity(word)) for w in by_similarity[:topn + 1] if w.text != word.text]
## Naive version
def most_similar_v2(word, topn=5):
    word = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    # by_similarity = sorted(queries, key=lambda w: cosine_similarity_numba(w.vector, word.vector), reverse=True)
    return [(w.text, w.similarity(word)) for w in by_similarity[:topn + 1] if w.text != word.text]
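Because most_similar_v1 sorts by the numba cosine but reports scores via Lexeme.similarity(), it is worth confirming that the two agree; a quick sanity check (a sketch):
a = nlp_zh.vocab['覺得']
b = nlp_zh.vocab['認為']
print(cosine_similarity_numba(a.vector, b.vector))  # should match the value below up to floating-point error
print(a.similarity(b))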
Compare the time needed by the two versions.
%%time
most_similar_v1("覺得", topn=3)
CPU times: user 7.42 s, sys: 309 ms, total: 7.73 s
Wall time: 14.9 s
[('其實', 0.79569775), ('會覺', 0.788269), ('以為', 0.78638524)]
%%time
most_similar_v2("覺得", topn=3)
CPU times: user 29.3 s, sys: 692 ms, total: 30 s
Wall time: 29.7 s
[('其實', 0.79569775), ('會覺', 0.788269), ('以為', 0.78638524)]
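Both versions still spend most of their time in a Python loop over Lexeme.similarity() or cosine_similarity_numba(). If the filtered queries are stacked into one matrix, the similarities can instead be computed in a single matrix-vector product; the following most_similar_vectorized is a sketch of that idea (it is not part of the original notebook):
def most_similar_vectorized(word, topn=5):
    target = nlp_zh.vocab[str(word)]
    queries = [
        w for w in nlp_zh.vocab
        if np.count_nonzero(w.vector) and not w.is_ascii and not w.is_punct and len(w.text) == 2
    ]
    mat = np.vstack([w.vector for w in queries])          # (n_queries, 300)
    sims = mat @ target.vector / (
        np.linalg.norm(mat, axis=1) * np.linalg.norm(target.vector) + 1e-8
    )
    order = np.argsort(-sims)                             # descending cosine similarity
    return [(queries[i].text, float(sims[i]))
            for i in order[:topn + 1] if queries[i].text != target.text][:topn]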
Defining Nodes for the Network#
Extract the top 1000 similar words for each near-syn.
These top 1000 context words form the basis for the nodes of the network.
%%time
near_syn_topn = dict([(w, most_similar_v1(w, topn=1000)) for w in near_syns])
CPU times: user 1min 8s, sys: 619 ms, total: 1min 9s
Wall time: 1min 9s
Top 10 similar words for each synonym in the list.
For example, the top 10 similar words for 覺得:
near_syn_topn[near_syns[0]][:10]
[('其實', 0.79569775),
('會覺', 0.788269),
('以為', 0.78638524),
('感覺', 0.7840089),
('看來', 0.7798325),
('畢竟', 0.7633344),
('因為', 0.7625315),
('討厭', 0.74918205),
('總覺', 0.743788),
('們覺', 0.74213)]
Convert the tuples into a list, which is easier to import into the graph structure.
near_syn_topn_list = []
for w, s in near_syn_topn.items():
    for s_w, s_s in s:
        near_syn_topn_list.append((w, s_w, s_s))
print(near_syn_topn_list[:10])
print(len(near_syn_topn_list))
[('覺得', '其實', 0.79569775), ('覺得', '會覺', 0.788269), ('覺得', '以為', 0.78638524), ('覺得', '感覺', 0.7840089), ('覺得', '看來', 0.7798325), ('覺得', '畢竟', 0.7633344), ('覺得', '因為', 0.7625315), ('覺得', '討厭', 0.74918205), ('覺得', '總覺', 0.743788), ('覺得', '們覺', 0.74213)]
10000
import pandas as pd
df = pd.DataFrame(near_syn_topn_list,columns=['w1','w2','sim'])
df[df['sim']>0.6]
| | w1 | w2 | sim |
|---|---|---|---|
| 0 | 覺得 | 其實 | 0.795698 |
| 1 | 覺得 | 會覺 | 0.788269 |
| 2 | 覺得 | 以為 | 0.786385 |
| 3 | 覺得 | 感覺 | 0.784009 |
| 4 | 覺得 | 看來 | 0.779832 |
| ... | ... | ... | ... |
| 9069 | 主張 | 方針 | 0.602148 |
| 9070 | 主張 | 這與 | 0.601582 |
| 9071 | 主張 | 迴避 | 0.601496 |
| 9072 | 主張 | 體制 | 0.601390 |
| 9073 | 主張 | 意見 | 0.601044 |

669 rows × 3 columns
Define Connections between Nodes#
While the context nodes already have connections (i.e., edges) to the key nodes (i.e., the near-syns), the context nodes may themselves be inter-connected due to their semantic similarity.
We again utilize the spacy language model to determine their semantic similarities. These similarities serve as the basis for the edges of the network.
We first identify all potential nodes for the network and then compute their pairwise similarities based on the spacy GloVe embeddings.
- nodes_id: all the possible nodes of the graph.
- edges_df: all the context-key and context-context edges of the graph.
WORD_SIMILARITY_CUTOFF = 0.65 # collexemes and target words
df2 = df[df['sim'] > WORD_SIMILARITY_CUTOFF]
nodes_id = list(set(list(df2['w2'].values) + list(df2['w1'].values)))
# nodes_similarities = pairwise_similarity(nodes_id, nlp_zh)
# nodes_similarities_df = pd.DataFrame(nodes_similarities, index=nodes_id,columns=nodes_id)
# nodes_similarities_df
## Creating nodes pairwise similarity matrix
print(len(nodes_id))
m = len(nodes_id)
distances = np.zeros((m,m))
for i in range(m):
    for j in range(m):
        distances[i, j] = nlp_zh.vocab[nodes_id[i]].similarity(nlp_zh.vocab[nodes_id[j]])
## Flattening the matrix
EMBEDDING_CUTOFF = 0.75
#print(node_names)
distances_flat = []
for i in range(m):
    for j in range(m):
        if distances[i, j] > EMBEDDING_CUTOFF and i != j:
            distances_flat.append((nodes_id[i], nodes_id[j], distances[i, j]))
edges_df = pd.DataFrame(distances_flat, columns=['w1','w2','sim'])
print(edges_df.shape)
180
(582, 3)
We then combine the context-key edges with the context-context edges. These edges are the final edges for the graph.
edges_df = pd.concat([edges_df, df2]).drop_duplicates()  # DataFrame.append was removed in pandas 2.x; concat gives the same result
print(edges_df.shape)
edges_df.loc[100:120,:]
(785, 3)
| | w1 | w2 | sim |
|---|---|---|---|
| 100 | 這樣 | 因為 | 0.799578 |
| 101 | 這樣 | 還是 | 0.754334 |
| 102 | 這樣 | 同樣 | 0.791491 |
| 103 | 這樣 | 這個 | 0.851545 |
| 104 | 這樣 | 應該 | 0.806186 |
| 105 | 這樣 | 其實 | 0.807909 |
| 106 | 這樣 | 畢竟 | 0.771049 |
| 107 | 這樣 | 這點 | 0.763548 |
| 108 | 這樣 | 沒有 | 0.756848 |
| 109 | 這樣 | 當然 | 0.778150 |
| 110 | 這樣 | 確實 | 0.770117 |
| 111 | 這樣 | 本來 | 0.769549 |
| 112 | 這樣 | 這麼 | 0.816209 |
| 113 | 所謂 | 其實 | 0.751369 |
| 114 | 所謂 | 並非 | 0.758192 |
| 115 | 納悶 | 驚訝 | 0.788380 |
| 116 | 納悶 | 訝異 | 0.826738 |
| 117 | 還是 | 總之 | 0.761258 |
| 118 | 還是 | 因為 | 0.808997 |
| 119 | 還是 | 總是 | 0.821862 |
| 120 | 還是 | 這樣 | 0.754334 |
Creating a Network#
We use networkx to first create a graph and compute relevant node-level metrics, e.g., centralities. We then create two data frames for the aesthetic specification of the graph:
- nodes_df
- edges_df
We use pyvis to visualize the network.
import networkx as nx
from pyvis.network import Network
#import pyvis.options as options
#from sklearn.preprocessing import MinMaxScaler
#from scipy.spatial.distance import cosine
#G = nx.Graph()
## A function to rescale metrics for plotting
def myRescaler(x):
    x = np.array(x)
    y = np.interp(x, (x.min(), x.max()), (5, 20))
    return list(y)
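np.interp maps the smallest input value to 5 and the largest to 20, interpolating linearly in between; a quick check (a sketch):
print(myRescaler([0.1, 0.4, 0.7]))  # [5.0, 12.5, 20.0]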
Create nodes_df.
G = nx.from_pandas_edgelist(edges_df, 'w1', 'w2', 'sim')
nodes_df = pd.DataFrame({'id': list(G.nodes),
                         'betweenness': myRescaler(list(nx.betweenness_centrality(G).values())),
                         'eigenvector': myRescaler(list(nx.eigenvector_centrality(G).values()))})
nodes_df['size'] = [5 if i not in near_syns else 10 for i in nodes_df['id']]  # iterate over the graph's node order, not nodes_id, to keep rows aligned
nodes_df['size2']= [i if i not in near_syns else 30 for i in nodes_df['eigenvector']]
nodes_df['group'] = ['KEY' if nodes_df.loc[i,'id'] in near_syns else 'CONTEXT' for i in range(nodes_df.shape[0])]
nodes_df['color'] = ['lightpink' if nodes_df.loc[i,'group']=='KEY' else 'lightblue' for i in range(nodes_df.shape[0])]
nodes_df['borderWidthSelected'] = list(np.repeat(20.0, nodes_df.shape[0]))
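The centrality scores can be inspected before plotting, e.g. to see which context words are most central in the network (a sketch; the output is not shown here):
nodes_df.sort_values('eigenvector', ascending=False).head(10)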
Visualizing a Network#
Plotting the network using pyvis.
Gvis = Network("768px","1600px", notebook=False,heading="Semantic Network")
# # Gvis.from_nx(G)
edges_in = list(edges_df.to_records(index=False))
#Gvis.add_nodes(list(G.nodes), value=nodes_df['size2'], color=nodes_df['color'], borderWidthSelected = nodes_df['borderWidthSelected'])
for i in range(nodes_df.shape[0]):
    Gvis.add_node(list(G.nodes)[i], value=nodes_df.loc[i, 'size2'], group=nodes_df.loc[i, 'group'])  # color=nodes_df.loc[i,'color'], borderWidthSelected=nodes_df.loc[i,'borderWidthSelected']
Gvis.add_edges(edges_in)
#Gvis.show_buttons()
Gvis.set_options("""
var options = {
  "nodes": {
    "borderWidth": 0,
    "color": {
      "highlight": {
        "border": "rgba(221,171,197,1)",
        "background": "rgba(248,178,255,1)"
      }
    },
    "shadow": {
      "enabled": true
    }
  },
  "edges": {
    "color": {
      "highlight": "rgba(255,192,200,1)",
      "inherit": false
    },
    "smooth": false
  },
  "interaction": {
    "hover": true,
    "navigationButtons": true
  },
  "manipulation": {
    "enabled": true
  },
  "physics": {
    "barnesHut": {
      "springLength": 270
    },
    "minVelocity": 0.75
  }
}
""")
# groups: {
# myGroup: {color:{background:'red'}, borderWidth:3}
# }
Gvis.show(DEMO_DATA_ROOT + '/reporting_verbs_chinese_Gvis.html')
edges_df.to_pickle(DEMO_DATA_ROOT+'/reporting_verbs_chinese_edges_df.pickle')
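The pickled edge list can be read back later with pandas, so the graph can be rebuilt without recomputing the similarities (a sketch):
# rebuild the graph from the saved edge list (a sketch)
edges_df = pd.read_pickle(DEMO_DATA_ROOT + '/reporting_verbs_chinese_edges_df.pickle')
G = nx.from_pandas_edgelist(edges_df, 'w1', 'w2', 'sim')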