import jieba
def get_text(filepath, encoding="utf-8"):
    """Read a whole text file and return its contents as one string.

    Args:
        filepath: Path of the text file to read.
        encoding: Codec used to decode the file. Defaults to "utf-8", the
            value that was previously hard-coded; pass e.g. "utf-16" or
            "gbk" for files that were saved in a different encoding.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decode error), so the file is never leaked.
    with open(filepath, "r", encoding=encoding) as f:
        return f.read()
def word_freq(filepath, text, topn):
    """Write the most frequent multi-character words of ``text`` to a file.

    ``text`` is segmented with ``jieba.lcut``; tokens of length 1 are
    skipped. The remaining words are counted, ordered by descending count,
    and the first ``topn`` are written one per line as ``word<TAB>count``.

    Args:
        filepath: Path of the source text file. It is only used to derive
            the output path: the extension is dropped and '_词频.txt' is
            appended (``novel.txt`` -> ``novel_词频.txt``).
        text: The text to analyse.
        topn: Maximum number of words to write. If fewer distinct words
            exist, all of them are written; values <= 0 write nothing.

    Returns:
        None. The result is written to the derived output file.

    Raises:
        OSError: If the output file cannot be created or written.
    """
    # Function-scope import keeps this block self-contained.
    import os

    counts = {}
    for word in jieba.lcut(text.strip()):
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # splitext handles extensions of any length, unlike slicing off 4 chars.
    out_path = os.path.splitext(filepath)[0] + '_词频.txt'

    # An explicit encoding avoids encode errors or mojibake when the platform
    # default codec cannot represent the words; `with` closes the file even
    # if a write fails.
    with open(out_path, 'w', encoding='utf-8') as f:
        # Slicing (instead of indexing 0..topn-1) cannot raise IndexError
        # when there are fewer than topn distinct words.
        for word, count in ranked[:max(topn, 0)]:
            f.write("{}\t{}\n".format(word, count))
运行主函数后
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
在网上找到的办法是
将 encoding="utf-8" 改为 errors="ignore"
但这个方法仅对个别文本有效,部分文本使用这个之后出现字母乱码
在此要特别感谢我朋友点醒了我
用记事本打开文本后,在窗口下方的状态栏中可以看到该文件的编码(例如 UTF-16 LE、ANSI 等)
此类文本按实际编码修改,例如 UTF-16 编码的文本改为 f = open(filepath, 'r', encoding='utf-16', errors="ignore")
GBK(ANSI)编码的文本改为 f = open(filepath, 'r', encoding='gbk', errors="ignore")
以此类推
问题解决