How to implement text classification with a decision tree:
1. Word segmentation module:
a. Curate the word dictionary by hand.
b. Segment the text with the hand-built dictionary and custom segmentation code (while adding entries to the dictionary, also delete words that interfere with it).
2. One-hot encoding:
While encoding, merge synonyms into a single one-hot dimension by assigning them the same id (see the first sketch after the code below).
3. Use the decision tree from sklearn for training, and tune it with sklearn's automated hyperparameter search (see the second sketch below).
The code is as follows:
#!/usr/bin/env python
# coding=utf-8
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in sklearn 0.20
import pandas as pd
import jieba
import pydot
from io import StringIO  # sklearn.externals.six is no longer shipped with sklearn
import re
# jieba.add_word("导诊台")
# jieba.del_word("机能")
# jieba.add_word("自助机")
# jieba.add_word("胶片袋")
# jieba.add_word("巡诊室")
# jieba.del_word("片东院")
# jieba.add_word("东院")
class node:
    """A helper tree node: key/value payload, a split condition, and child ids."""
    def __init__(self, key=None, value=None, condition=None, left_id=None, right_id=None):
        self.key = key
        self.condition = condition
        self.value = value
        self.left_id = left_id
        self.right_id = right_id
def segment():
    """
    Word segmentation: cut every row of the CSV with jieba.
    :return:
    """
    typedict = {0: str}
    # result_file = open("segment_result.txt", "w", encoding="utf8")
    train_data = pd.read_csv("E:/协和问答系统/SenLiu/voice.csv", dtype=typedict)
    jieba.load_userdict("E:/协和问答系统/SenLiu/words.txt")
    for row in train_data.index:
        datas = train_data.loc[row].values[0]
        # Skip empty cells first: pandas reads missing values as float NaN,
        # and jieba.cut() would fail on them.
        if isinstance(datas, float):
            continue
        words = list(jieba.cut(datas))
        del_set = []
        with open("E:/协和问答系统/SenLiu/words.txt", "r", encoding="utf8") as words_file:
            for word in words_file:
                word = word.strip()
                if word != "":
                    # The original listing breaks off here; as a hedged guess at
                    # the intent, collect dictionary entries that occur in this
                    # sentence so the segmentation can be checked against them.
                    if word in datas:
                        del_set.append(word)
        # Emit the segmentation result (the original hints at writing to
        # segment_result.txt; printing is used here as a stand-in).
        print(" ".join(words))
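
Step 2 is only described in prose above, so here is a minimal sketch of one-hot encoding with synonym merging. The synonym table and the helper names (synonyms, canonical, build_vocab, one_hot) are illustrative assumptions, not part of the original project:

# Minimal sketch of step 2 (assumed helpers, not from the original code):
# synonyms maps each word to a canonical form; all synonyms share one id.
synonyms = {"大夫": "医生", "自助机器": "自助机"}

def canonical(word):
    return synonyms.get(word, word)

def build_vocab(segmented_sentences):
    # Assign one id per canonical word, so synonyms collapse to the same slot.
    vocab = {}
    for words in segmented_sentences:
        for w in words:
            w = canonical(w)
            if w not in vocab:
                vocab[w] = len(vocab)
    return vocab

def one_hot(words, vocab):
    # Encode a segmented sentence as a 0/1 vector over the merged vocabulary.
    vec = [0] * len(vocab)
    for w in words:
        vec[vocab[canonical(w)]] = 1
    return vec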
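
Step 3 likewise has no code above. The sketch below trains a sklearn decision tree and tunes it with GridSearchCV, sklearn's automated parameter search; the parameter grid and the 80/20 split are assumptions, not the original settings:

from sklearn import tree
from sklearn.model_selection import GridSearchCV, train_test_split

def train_tree(X, y):
    # X: one-hot feature matrix, y: class labels (from the steps above).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    param_grid = {
        "max_depth": [5, 10, 20, None],
        "min_samples_leaf": [1, 2, 5],
        "criterion": ["gini", "entropy"],
    }
    search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5)
    search.fit(X_train, y_train)
    print("best params:", search.best_params_)
    print("test accuracy:", search.best_estimator_.score(X_test, y_test))
    return search.best_estimator_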