from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer # 用于词形还原
def get_wordnet_pos(tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS code.

    Args:
        tag: Treebank-style tag string (e.g. ``"JJ"``, ``"VBD"``, ``"NNS"``,
            ``"RB"``). Only the first character is inspected.

    Returns:
        ``"a"`` for adjectives (tags starting with ``J``), ``"v"`` for verbs,
        ``"n"`` for nouns, ``"r"`` for adverbs, and ``None`` for any other
        tag (including the empty string).
    """
    # WordNet identifies adjectives with "a" (wordnet.ADJ); "j" is only the
    # Treebank prefix and is not a valid WordNet POS code.
    prefix_to_pos = {"J": "a", "V": "v", "N": "n", "R": "r"}
    # Slicing (rather than indexing) keeps the empty string safe -> None.
    return prefix_to_pos.get(tag[:1])
sentence = 'football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.'
tokens = word_tokenize(sentence)  # Step 1: split the sentence into tokens
tagged_sent = pos_tag(tokens)  # Step 2: tag each token with its part of speech
wnl = WordNetLemmatizer()  # Step 3: lemmatizer (not a stemmer)
# Each tagged item is a (word, tag) pair. When get_wordnet_pos returns None
# for a tag, fall back to treating the word as a noun.
lemmas_sent = [
    wnl.lemmatize(word, pos=get_wordnet_pos(treebank_tag) or wordnet.NOUN)
    for word, treebank_tag in tagged_sent
]
print(lemmas_sent)
# NLTK处理文本的三步走
# 最新推荐文章于 2024-10-23 18:22:47 发布