TextRank原理及实现

本文解析了TextRank算法的工作原理,并给出了代码实现细节:通过jieba分词、加载停用词、去除停用词、获取n_gram(关联词)等步骤进行关键词提取。同时探讨了算法的迭代过程及收敛条件,并提供了具体实例。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

TextRank的原理与PageRank相同,这里暂不论述。

1.代码实现

# -*- coding: utf-8 -*-
import jieba

# Tokenize the text with jieba
# Returns the flat list of tokens
def getWords(text):
	"""Split *text* into paragraphs on newlines and tokenize each with jieba.

	Returns a single flat list holding the tokens of every paragraph, in
	their original order.
	"""
	return [
		token
		for paragraph in text.split('\n')
		for token in jieba.lcut(paragraph)
	]

# Load stop words from a local file
def loadStopWords(fileName):
	"""Read a stop-word file containing one word per line.

	The file is decoded as UTF-8 (a leading BOM, if present, is skipped)
	instead of relying on the platform default encoding, which breaks on
	Chinese word lists on non-UTF-8 systems.

	Args:
		fileName: path of the stop-word file.

	Returns:
		A list of stop words in file order. Blank lines (including the one
		produced by a trailing newline) are skipped.

	Raises:
		OSError: if the file cannot be opened.
		UnicodeDecodeError: if the file is not valid UTF-8.
	"""
	stopWords = []
	with open(fileName, 'r', encoding='utf-8-sig') as fp:
		for line in fp:
			word = line.rstrip('\n')
			if word:
				stopWords.append(word)
	return stopWords

# Remove stop words
def filterStopWords(wordList, stopWords):
	"""Return the words of *wordList* that are not stop words.

	Args:
		wordList: iterable of tokens (hashable, normally str).
		stopWords: iterable of stop words.

	Returns:
		A new list with the surviving words, order and duplicates preserved.
	"""
	# Build the lookup set once so each membership test is O(1) rather than
	# a linear scan over the whole stop-word list.
	stopSet = frozenset(stopWords)
	return [word for word in wordList if word not in stopSet]


# Collect the n-gram neighbours (co-occurring words, i.e. graph edges) of every word
def getNgramDict(wordList, nGram):
	"""Build the word graph used by the TextRank iteration.

	Every distinct word becomes a node whose neighbours are the words found
	within *nGram* positions before or after any of its occurrences.

	Args:
		wordList: sequence of tokens.
		nGram: window radius; values <= 0 produce nodes without neighbours.

	Returns:
		A dict mapping each word to
		``{'oddScore': 0, 'evenScore': 1, 'oldScore': 0, 'gramWords': [...]}``
		where ``gramWords`` is the de-duplicated neighbour list in order of
		first appearance.
	"""
	gramDic = {}
	for index, key in enumerate(wordList):
		# Continue from the neighbours gathered at earlier occurrences.
		gramWords = list(gramDic[key]['gramWords']) if key in gramDic else []
		if nGram > 0:
			# Clamp at 0: a negative slice start would wrap around to the end
			# of the list and silently drop the left-hand neighbours of the
			# first nGram words.
			gramWords.extend(wordList[max(0, index - nGram):index])
			gramWords.extend(wordList[index + 1:index + nGram + 1])
		gramDic[key] = {
			'oddScore': 0,
			'evenScore': 1,
			'oldScore': 0,
			# dict.fromkeys de-duplicates while keeping a deterministic order.
			'gramWords': list(dict.fromkeys(gramWords)),
		}
	return gramDic

def countOldScore(gramDic):
	"""Return the sum of the 'oldScore' field over every entry of *gramDic*."""
	runningTotal = 0
	for entry in gramDic.values():
		runningTotal = runningTotal + entry['oldScore']
	return runningTotal

def showResult(gramDic):
	"""Print a separator line, then one "key value" line per entry of *gramDic*."""
	print("-----------------")
	for word, info in gramDic.items():
		print(word, info)

# Run the TextRank score iteration
def iterStart(gramDic, iterNum=100, alpha=0.85, eps=0.01):
	"""Iterate TextRank scores in place until convergence or *iterNum* rounds.

	Two score slots ('evenScore' / 'oddScore') are used alternately: each
	round reads the scores from one slot, accumulates the new scores into the
	other, and resets the slot it read from to 0. After a round, 'oldScore'
	holds the scores that round started from.

	Args:
		gramDic: graph as produced by getNgramDict; modified in place.
		iterNum: maximum number of rounds.
		alpha: share of a node's score spread evenly over its neighbours;
			the remaining (1 - alpha) stays on the node itself.
		eps: a node counts as converged when its score changed by less than
			this amount during the round.

	Returns:
		None. Prints a message when the iteration stops early.
	"""
	for i in range(iterNum):
		# Even rounds read 'evenScore' and write 'oddScore'; odd rounds swap.
		if i % 2 == 0:
			cur, nxt = 'evenScore', 'oddScore'
		else:
			cur, nxt = 'oddScore', 'evenScore'

		for key in gramDic:
			node = gramDic[key]
			gramWords = node['gramWords']
			if gramWords:
				score = alpha * node[cur] / len(gramWords)
				for word in gramWords:
					gramDic[word][nxt] += score
				node[nxt] += (1 - alpha) * node[cur]
			else:
				# A node without neighbours keeps its whole score; this avoids
				# a division by zero and keeps the total score constant.
				node[nxt] += node[cur]
			node['oldScore'] = node[cur]
			node[cur] = 0

		# Stop as soon as every node moved by less than eps in this round.
		if all(abs(node['oldScore'] - node[nxt]) < eps for node in gramDic.values()):
			# i is zero-based, so i + 1 rounds have been performed.
			print("迭代提前终止,共迭代%d次" % (i + 1))
			break

# Pick the highest-scoring keywords
def getTopK(gramDic, topK):
	"""Return up to *topK* words ranked by descending 'oldScore'.

	Single-character words are skipped. Words with equal scores keep their
	order in *gramDic*, because the sort is stable.

	Args:
		gramDic: mapping of word -> dict containing an 'oldScore' entry.
		topK: maximum number of keywords; values <= 0 yield an empty list.

	Returns:
		A list of at most *topK* words.
	"""
	result = []
	if topK <= 0:
		# Without this guard the first qualifying word would be appended
		# before the length check had a chance to stop the loop.
		return result
	ranked = sorted(gramDic.items(), key=lambda x: x[1]['oldScore'], reverse=True)
	for word, _ in ranked:
		if len(word) > 1:
			result.append(word)
			if len(result) >= topK:
				break
	return result

def KeyWordsWordsCount(wordList, topK=3):
	"""Return up to *topK* keywords chosen purely by word frequency.

	Single-character words are skipped. Words with equal counts keep the
	order of their first appearance in *wordList*.

	Args:
		wordList: iterable of tokens (hashable, normally str).
		topK: maximum number of keywords; values <= 0 yield an empty list.

	Returns:
		A list of at most *topK* words, most frequent first.
	"""
	from collections import Counter

	result = []
	if topK <= 0:
		# Without this guard the first qualifying word would be appended
		# before the length check had a chance to stop the loop.
		return result
	for word, _ in Counter(wordList).most_common():
		if len(word) > 1:
			result.append(word)
			if len(result) >= topK:
				break
	return result

# TextRank keyword extraction
def KeyWordsTextRank(text, topK=3, stopWordsFile="stop_words.txt", nGram=2):
	"""Extract keywords from *text* with TextRank.

	Pipeline: load stop words, tokenize, drop stop words, build the word
	graph, iterate the scores, then pick the top-scoring words. Two
	diagnostic lines are printed along the way: the total score together
	with the node count, and the top 5 words by plain frequency for
	comparison.

	Args:
		text: the text to analyse.
		topK: maximum number of keywords to return.
		stopWordsFile: path of the stop-word file (one word per line).
		nGram: window radius used to link co-occurring words.

	Returns:
		The list of keywords produced by getTopK.
	"""
	stopWords = loadStopWords(stopWordsFile)  # load stop words
	wordList = getWords(text)  # tokenize
	wordList = filterStopWords(wordList, stopWords)  # drop stop words
	gramDic = getNgramDict(wordList, nGram)  # link co-occurring words
	iterStart(gramDic)  # iterate the scores
	# showResult(gramDic)  # uncomment to inspect the scores after iteration
	print(countOldScore(gramDic), len(gramDic))  # sanity check on the converged scores
	keyWords = getTopK(gramDic, topK)
	print(KeyWordsWordsCount(wordList, 5))
	return keyWords

# Demo: extract the top 5 keywords from a sample sentence and print them.
# NOTE: this runs at import time; KeyWordsTextRank reads "stop_words.txt"
# from the current working directory.
text = '''售单价是5元/只、销售金额为2400元,未获利润,上述不合格产品已不能召回。本案共计货值2400元'''
print(KeyWordsTextRank(text, 5))

2.相关问题

(1)如何保证每次迭代总得分不变?

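代码中为每个词设置 oddScore 和 evenScore 两个得分变量交替使用:每轮迭代只读取上一轮的得分、只累加到本轮的得分,读取完后再把上一轮的得分清零。每个词的得分被完整地分配出去(alpha 部分平均分给关联词,1-alpha 部分留给自己),因此总得分保持不变。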

 

(2)TextRank一定会收敛吗?

将迭代过程看作一个Markov链(在词图上的随机游走),并证明其转移矩阵满足Markov链收敛的条件(不可约且非周期),此时迭代会收敛到唯一的平稳分布。

 

(3)如何高效地进行迭代?

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值