import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import cossim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# If you have not downloaded NLTK's tokenizer and stopword data yet, uncomment the lines below and run them once
# nltk.download('punkt')
# nltk.download('stopwords')
# Text preprocessing: lowercase, tokenize, keep alphabetic tokens, drop English stopwords
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]  # keep alphabetic tokens only
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return tokens
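# Example (illustrative only, assuming NLTK's default English stopword list):
# preprocess("Text processing using LDA is interesting.")
# returns roughly ['text', 'processing', 'using', 'lda', 'interesting']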
# Sample documents
documents = [
    "Text processing using LDA is interesting.",
    "Another document example for LDA.",
    "Text mining and natural language processing.",
    "LDA helps in topic modeling and finding patterns.",
    "This document is for testing LDA similarity."
]
# Preprocess all documents
texts = [preprocess(doc) for doc in documents]
# Build a dictionary mapping tokens to integer ids
dictionary = corpora.Dictionary(texts)
# Convert each document to its bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]
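# The snippet imports LdaModel and cossim but breaks off before using them; the
# rest of this block is a minimal sketch of the likely next steps, not the
# original author's code. num_topics, passes, and random_state are assumed values.

# Train an LDA topic model on the bag-of-words corpus
lda_model = LdaModel(corpus=corpus, id2word=dictionary,
                     num_topics=3, random_state=42, passes=10)

# Represent two documents as (topic_id, probability) vectors and compare them
# with cosine similarity in topic space
doc1_topics = lda_model.get_document_topics(corpus[0], minimum_probability=0)
doc2_topics = lda_model.get_document_topics(corpus[4], minimum_probability=0)
similarity = cossim(doc1_topics, doc2_topics)
print(f"Topic-space similarity between documents 1 and 5: {similarity:.4f}")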