1.数据集的收集清洗
使用 Movie Review Data 数据集,评论分为正面和负面两类。将每条评论视为时序数据:取文件的前 200 个单词(即代码中的 MAX_DOCUMENT_LENGTH)作为一个序列,每个单词再由词袋模型进行编码。
# Sequence length passed as `maxlen` when the train/test sequences are padded.
MAX_DOCUMENT_LENGTH = 200
# NOTE(review): not referenced in the visible part of this file; presumably
# the width of the word-embedding vectors -- confirm against the network code.
EMBEDDING_SIZE = 50
# Module-level value that do_rnn reads through `global`; starts at 0.
# Presumably the vocabulary size, assigned elsewhere -- confirm.
n_words = 0
def load_data():
    """Load the positive and negative movie reviews as one labelled dataset.

    Calls ``load_files`` once on the ``pos/`` directory with label 0 and once
    on the ``neg/`` directory with label 1, then concatenates the two results
    with ``+`` (positive samples first).

    Returns:
        A pair ``(x, y)``: the concatenated samples and the concatenated
        labels. Presumably both are lists of equal length -- this depends on
        what ``load_files`` returns; confirm against its definition.
    """
    # Both class directories live under the same dataset root; the resulting
    # path strings are identical to the ones originally hard-coded.
    root = "../data/movie-review-data/review_polarity/txt_sentoken/"
    # NOTE(review): positive reviews are labelled 0 and negative ones 1.
    pos_x, pos_y = load_files(root + "pos/", 0)
    neg_x, neg_y = load_files(root + "neg/", 1)
    return pos_x + neg_x, pos_y + neg_y
2.训练
def do_rnn(trainX, testX, trainY, testY):
global n_words
# Data preprocessing
# Sequence padding
print "GET n_words embedding %d" % n_words
trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
# Network building
net = tflearn.input_data([None, MAX_DOCUMENT_LE