TextRank的原理与PageRank相同,这里暂不展开论述。
1.代码实现
# -*- coding: utf-8 -*-
from collections import Counter

import jieba
def getWords(text):
    """Segment ``text`` into words with jieba.

    The text is split on newline characters and every piece is passed to
    ``jieba.lcut`` separately; the per-line results are concatenated in
    their original order.

    Args:
        text: The raw text to segment.

    Returns:
        A flat list holding the tokens returned by ``jieba.lcut`` for
        each line.
    """
    return [token for line in text.split('\n') for token in jieba.lcut(line)]
def loadStopWords(fileName):
    """Load stop words from a local text file.

    Args:
        fileName: Path of the stop-word file; it is expected to hold one
            stop word per line.

    Returns:
        A list with one entry per newline-separated piece of the file.
        A file that ends with a newline therefore yields a trailing
        empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # default encoding, which is not UTF-8 everywhere and would garble
    # (or fail on) a Chinese stop-word list.
    with open(fileName, 'r', encoding='utf-8') as fp:
        return fp.read().split('\n')
def filterStopWords(wordList, stopWords):
    """Remove stop words from a list of words.

    Args:
        wordList: Words to filter.
        stopWords: Collection of words that must be dropped.

    Returns:
        A new list with the words of ``wordList`` that are not stop
        words, in their original order (duplicates are kept).
    """
    # Build the lookup set once so each membership test is O(1) instead
    # of scanning the whole stop-word list for every word.
    stopSet = set(stopWords)
    return [word for word in wordList if word not in stopSet]
def getNgramDict(wordList, nGram):
    """Build the co-occurrence graph used by the ranking iteration.

    Every distinct word becomes a node. Two words are linked when they
    appear within ``nGram`` positions of each other in ``wordList``.

    Args:
        wordList: Sequence of words in document order.
        nGram: Window radius, i.e. how many words on each side of a word
            count as its neighbours. Values <= 0 produce no links.

    Returns:
        A dict mapping each word to a record with the keys
        ``'oddScore'`` (0), ``'evenScore'`` (1), ``'oldScore'`` (0) and
        ``'gramWords'`` (the de-duplicated neighbour list, in first-seen
        order, accumulated over every occurrence of the word).
    """
    gramDic = {}
    for index, key in enumerate(wordList):
        # Start from the neighbours gathered at earlier occurrences.
        neighbours = list(gramDic[key]['gramWords']) if key in gramDic else []
        if nGram > 0:
            # Clamp at 0: a negative slice start would be counted from
            # the end of the list and yield an empty slice, silently
            # dropping the left neighbours of the first nGram words.
            start = max(0, index - nGram)
            neighbours.extend(wordList[start:index])
            neighbours.extend(wordList[index + 1:index + nGram + 1])
        gramDic[key] = {
            'oddScore': 0,
            'evenScore': 1,
            'oldScore': 0,
            # dict.fromkeys removes duplicates while keeping a
            # deterministic (first-seen) order.
            'gramWords': list(dict.fromkeys(neighbours)),
        }
    return gramDic
def countOldScore(gramDic):
    """Return the sum of the ``'oldScore'`` values of all words.

    Args:
        gramDic: Mapping of word -> record containing an ``'oldScore'``
            entry.

    Returns:
        The total of every record's ``'oldScore'``; 0 for an empty dict.
    """
    return sum(record['oldScore'] for record in gramDic.values())
def showResult(gramDic):
    """Print a separator line, then each word followed by its record."""
    print("-----------------")
    for word, record in gramDic.items():
        print(word, record)
def iterStart(gramDic, iterNum=100, alpha=0.85, eps=0.01):
    """Run the score-propagation iteration in place on ``gramDic``.

    In every round each word hands ``alpha`` of its current score to its
    neighbours (split equally among ``'gramWords'``) and keeps the
    remaining ``1 - alpha`` for itself. Two score slots, ``'evenScore'``
    and ``'oddScore'``, alternate as read and write buffers, so a round
    only moves score around and the total stays constant.

    Args:
        gramDic: Graph as built by ``getNgramDict``; it is mutated.
        iterNum: Maximum number of rounds.
        alpha: Share of a word's score that is passed to its neighbours.
        eps: A word counts as converged when its score changes by less
            than this in a round; the loop stops once all words have.

    Returns:
        None. On return ``'oldScore'`` of each word holds the score it
        had before the last executed round.
    """
    for i in range(iterNum):
        # Even rounds read 'evenScore' and write 'oddScore'; odd rounds
        # do the reverse. The buffer that was read is zeroed afterwards.
        if i % 2 == 0:
            src, dst = 'evenScore', 'oddScore'
        else:
            src, dst = 'oddScore', 'evenScore'

        for record in gramDic.values():
            current = record[src]
            neighbours = record['gramWords']
            if neighbours:
                share = alpha * current / len(neighbours)
                for word in neighbours:
                    gramDic[word][dst] += share
                record[dst] += (1 - alpha) * current
            else:
                # An isolated word has nobody to pass score to; it keeps
                # everything, which avoids dividing by zero and keeps
                # the total score unchanged.
                record[dst] += current
            record['oldScore'] = current
            record[src] = 0

        if all(abs(record['oldScore'] - record[dst]) < eps
               for record in gramDic.values()):
            # i is zero-based, so i + 1 rounds have been executed.
            print("迭代提前终止,共迭代%d次" % (i + 1))
            break
def getTopK(gramDic, topK):
    """Return the highest-scoring keywords.

    Words are ranked by ``'oldScore'`` in descending order (ties keep
    the dict's iteration order); single-character words are skipped.

    Args:
        gramDic: Mapping of word -> record containing ``'oldScore'``.
        topK: Maximum number of keywords to return.

    Returns:
        A list of at most ``topK`` words; empty when ``topK <= 0``.
    """
    result = []
    # Without this guard the loop below would still append one word,
    # because the limit is only checked after appending.
    if topK <= 0:
        return result
    ranked = sorted(gramDic.items(),
                    key=lambda item: item[1]['oldScore'],
                    reverse=True)
    for word, _ in ranked:
        if len(word) > 1:
            result.append(word)
            if len(result) >= topK:
                break
    return result
def KeyWordsWordsCount(wordList, topK=3):
    """Return the most frequent words as a frequency-based baseline.

    Words are ranked by how often they occur in ``wordList`` (ties keep
    first-seen order); single-character words are skipped.

    Args:
        wordList: Words to count.
        topK: Maximum number of keywords to return.

    Returns:
        A list of at most ``topK`` words; empty when ``topK <= 0``.
    """
    result = []
    # Without this guard the loop below would still append one word,
    # because the limit is only checked after appending.
    if topK <= 0:
        return result
    # most_common() with no argument returns every (word, count) pair
    # sorted by count, highest first.
    for word, _ in Counter(wordList).most_common():
        if len(word) > 1:
            result.append(word)
            if len(result) >= topK:
                break
    return result
def KeyWordsTextRank(text, topK=3, stopWordsFile="stop_words.txt", nGram=2):
    """Extract keywords from ``text`` with the TextRank pipeline.

    Steps: load stop words, segment the text, drop stop words, build the
    co-occurrence graph, iterate the scores, and pick the top words.

    Two diagnostic lines are printed along the way: the total
    ``'oldScore'`` together with the number of distinct words (a check
    that the iteration kept the total score), and the top 5 words by
    plain frequency for comparison.

    Args:
        text: The text to analyse.
        topK: Maximum number of keywords to return.
        stopWordsFile: Path of the stop-word file.
        nGram: Window radius used to link co-occurring words.

    Returns:
        A list of at most ``topK`` keywords ranked by score.
    """
    stopWords = loadStopWords(stopWordsFile)
    wordList = filterStopWords(getWords(text), stopWords)
    gramDic = getNgramDict(wordList, nGram)
    iterStart(gramDic)
    # Call showResult(gramDic) here to inspect the per-word scores.
    # Sanity check: the total score should equal the number of words.
    print(countOldScore(gramDic), len(gramDic))
    keyWords = getTopK(gramDic, topK)
    # Frequency-based baseline, printed for comparison.
    print(KeyWordsWordsCount(wordList, 5))
    return keyWords
# Sample input for the demo run below.
text = '''售单价是5元/只、销售金额为2400元,未获利润,上述不合格产品已不能召回。本案共计货值2400元'''

# Run the demo only when executed as a script, so importing this module
# does not read stop_words.txt or print anything.
if __name__ == "__main__":
    print(KeyWordsTextRank(text, 5))
2.相关问题
(1)如何保证每次迭代总得分不变?
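代码中为每个词设置oddScore和evenScore两个得分变量并交替使用:每轮迭代把其中一个变量中的得分按alpha的比例平均分给相邻的词、按(1-alpha)的比例留给自己,全部累加到另一个变量中,然后将原变量清零。得分只是在词之间重新分配,因此总得分保持不变。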
(2)TextRank一定会收敛吗?
可以将迭代过程看作一个Markov链的状态转移,只要证明该链不可约且非周期(即满足Markov链收敛到唯一平稳分布的条件),就能说明TextRank会收敛。
(3)如何高效地进行迭代?