# ===== Section: Hillary "email-gate" — PageRank on the email network =====
# 定义别名转换函数
def unify_name(name):
# 统一name为小写字母
name = str(name).lower()
# 去掉,和; 以及 @ 后面的内容
name = name.replace(',', '')
name = name.replace(';', '').split('@')[0]
# 别名转换
if name in aliases.keys():
#下面这行看不懂 如果emails的名字在aliases的别名表里就把它改成正规persons表里 通过id映射到persions这个表
return persons[aliases[name]]
else:
return name
def show_graph(graph, layout='spring_layout'):
# , circular_layout 圆环状布局
if layout == 'circular_layout':
pos = nx.circular_layout(graph)
else:
pos = nx.spring_layout(graph)
# 设置网络图中的节点大小, *10000 因为 pagerank 值很小
nodesize = [x['pagerank'] * 10000 for v, x in graph.nodes(data=True)]
# 设置网络图中的边长度 用权重衡量
edgesize = [np.sqrt(e[2]['weight']) for e in graph.edges(data=True)]
# 绘制节点
nx.draw_networkx_nodes(graph, pos, node_size=nodesize, alpha=0.4)
# 绘制边
nx.draw_networkx_edges(graph, pos, edge_size=edgesize, alpha=0.2)
# 绘制节点的 label
nx.draw_networkx_labels(graph, pos, font_size=10)
plt.show()
# 将邮件数据中寄件人和收件人的姓名进行规范化
emails.MetadataFrom = emails.MetadataFrom.apply(unify_name)
emails.MetadataTo = emails.MetadataTo.apply(unify_name)
# 设置权重等于发邮件的次数
edges_weights_temp = defaultdict(list)
for row in zip(emails.MetadataFrom, emails.MetadataTo):
temp = (row[0], row[1])
# print(temp)
if temp not in edges_weights_temp:
edges_weights_temp[temp] = 1
else:
edges_weights_temp[temp] = edges_weights_temp[temp] + 1
# print(edges_weights_temp.items())
# for key, val in edges_weights_temp.items():
# print(key,val)
# 转化格式 (from, to), weight => from, to, weight
edges_weights = [(key[0], key[1], val) for key, val in edges_weights_temp.items()]
# edges_weights = [(key[1], key[0], val) for key, val in edges_weights_temp.items()]
# print(edges_weights[:20])
# 创建一个有向图
graph = nx.DiGraph()
# 设置有向图中的路径及权重 (from, to, weight) 应用到微博中是不是点赞的和转发的是from 重点受众人群是to weight是加起来的次数
graph.add_weighted_edges_from(edges_weights)
# print(graph)
# 计算每个节点的PR值并作为节点的pagerank属性
pagerank = nx.pagerank(graph)
#两种方法对pagerank值进行排序
d = list(zip(pagerank.values(), pagerank.keys()))
d = sorted(d)
print(d)
d2 = sorted(pagerank.items(), key=lambda x: x[1])
print(d2)
# nx.set_node_attributes(graph, name='pagerank', values=pagerank)
# # 画图
# show_graph(graph)
#
# # 精简图
# # 设置 PR 值的阈值,筛选大于阈值的重要核心节点
# pagerank_threshold = 0.005
# # 复制一份计算好的图
# small_graph = graph.copy()
# # 删除PR值小于pagerank_threshold的节点
# for n, p_rank in graph.nodes(data=True):
# if p_rank['pagerank'] < pagerank_threshold:
# small_graph.remove_node(n)
# # 画网络图, 采用 circular_layout 布局
# show_graph(small_graph, 'circular_layout')
# ===== page1: PageRank by hand — power iteration on a small edge list =====
# 输入为一个*.txt文件,例如
# A B
# B C
# B A
# ...表示前者指向后者
import numpy as np
if __name__ == '__main__':
# 读入有向图,存储边
f = open('./data/input_1.txt', 'r')
edges = [line.strip('\n').split(' ') for line in f]
print(edges)
# 根据边获取节点的集合
nodes = []
for edge in edges:
if edge[0] not in nodes:
nodes.append(edge[0])
if edge[1] not in nodes:
nodes.append(edge[1])
print(nodes)
N = len(nodes)
# 将节点符号(字母),映射成阿拉伯数字,便于后面生成A矩阵/S矩阵
i = 0
node_to_num = {}
for node in nodes:
node_to_num[node] = i
i += 1
for edge in edges:
edge[0] = node_to_num[edge[0]]
edge[1] = node_to_num[edge[1]]
print(edges)
# 生成初步的S矩阵
S = np.zeros([N, N])
for edge in edges:
S[edge[1], edge[0]] = 1
print(S)
# 计算比例:即一个网页对其他网页的PageRank值的贡献,即进行列的归一化处理
for j in range(N):
sum_of_col = sum(S[:, j])
for i in range(N):
S[i, j] /= sum_of_col
print(S)
# 计算矩阵A
alpha = 0.85
A = alpha * S + (1 - alpha) / N * np.ones([N, N])
print(A)
# 生成初始的PageRank值,记录在P_n中,P_n和P_n1均用于迭代
P_n = np.ones(N) / N
P_n1 = np.zeros(N)
e = 100000 # 误差初始化
k = 0 # 记录迭代次数
print('loop...')
while e > 0.00000001: # 开始迭代
P_n1 = np.dot(A, P_n) # 迭代公式
e = P_n1 - P_n
e = max(map(abs, e)) # 计算误差
P_n = P_n1
k += 1
print('iteration %s:' % str(k), P_n1)
print('final result:', P_n)
# ===== page2: PageRank by hand on Wiki-Vote, with dangling-node handling =====
# 输入为一个*.txt文件,例如
# A B
# B C
# B A
# ...表示前者指向后者
import numpy as np
if __name__ == '__main__':
# 读入有向图,存储边
f = open('./data/Wiki-Vote.txt', 'r')
edges = [line.strip('\n').split('\t') for line in f]
print(edges)
# 根据边获取节点的集合
nodes = []
for edge in edges:
if edge[0] not in nodes:
nodes.append(edge[0])
if edge[1] not in nodes:
nodes.append(edge[1])
print(nodes)
N = len(nodes)
# 将节点符号(字母),映射成阿拉伯数字,便于后面生成A矩阵/S矩阵
i = 0
node_to_num = {}
for node in nodes:
node_to_num[node] = i
i += 1
for edge in edges:
edge[0] = node_to_num[edge[0]]
edge[1] = node_to_num[edge[1]]
print(edges)
# 生成初步的S矩阵
S = np.zeros([N, N])
for edge in edges:
S[edge[1], edge[0]] = 1
print(S)
# 计算比例:即一个网页对其他网页的PageRank值的贡献,即进行列的归一化处理
for j in range(N):
sum_of_col = sum(S[:, j])
for i in range(N):
if sum_of_col != 0:
S[i, j] /= sum_of_col
else:
S[i, j] = 1 / N
# S[i, j] /= sum_of_col
print(S)
# 计算矩阵A
alpha = 0.85
A = alpha * S + (1 - alpha) / N * np.ones([N, N])
print(A)
# 生成初始的PageRank值,记录在P_n中,P_n和P_n1均用于迭代
P_n = np.ones(N) / N
P_n1 = np.zeros(N)
e = 100000 # 误差初始化
k = 0 # 记录迭代次数
print('loop...')
while e > 0.00000001: # 开始迭代
P_n1 = np.dot(A, P_n) # 迭代公式
e = P_n1 - P_n
e = max(map(abs, e)) # 计算误差
P_n = P_n1
k += 1
print('iteration %s:' % str(k), P_n1)
print('final result:', P_n)
# ===== page3: PageRank via networkx on the Wiki-Vote data =====
# import os
# import networkx as nx
# os.chdir('C:\\Users\\XXX\\Desktop\\')
# filename = 'Wiki-Vote.txt'
# G=nx.DiGraph()
# with open(filename) as file:
# for line in file:
# head, tail = [int(x) for x in line.split()]
# G.add_edge(head,tail)
#
# pr=nx.pagerank(G,alpha=0.85)
# x = 0;
# for node, value in pr.items():
# x = x + value
# print(x)
import os
import networkx as nx
# D:\PycharmProjects2020\xiha_pagerank\data
os.chdir('D:\\PycharmProjects2020\\xiha_pagerank\\data\\')
filename = 'Wiki-Vote.txt'
G=nx.DiGraph()
with open(filename) as file:
for line in file:
head, tail = [int(x) for x in line.split()]
G.add_edge(head,tail)
pr=nx.pagerank(G,alpha=0.85)
#排序
d2 = sorted(pr.items(), key=lambda x: x[1])
print(d2)
print(pr)
# x = 0;
# for node, value in pr.items():
# x = x + value
# print(x)