Implementing an LSTM in PyTorch

This post implements an LSTM network with the PyTorch framework to perform sentiment analysis on IMDb movie reviews, walking through the whole pipeline from data preprocessing to model training: loading word vectors, encoding the text, batching, and evaluating the model.


**PyTorch Basics**

Gradient descent:

# -*- coding: utf-8 -*-
from math import pi
import torch
import torch.optim

x = torch.tensor([pi / 3, pi / 6], requires_grad=True)
optim = torch.optim.SGD([x], lr=0.1, momentum=0)

for step in range(11):
    if step:
        optim.zero_grad()   # clear accumulated gradients
        f.backward()        # compute gradients
        optim.step()        # update parameters
    f = -((x.cos() ** 2).sum()) ** 2  # f can also be defined before the if; the result is the same
    print('step: {}, x = {}, f(x) = {}'.format(step, x.tolist(), f.item()))
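Since f(x) = -(cos²x1 + cos²x2)² attains its minimum of -4 at x = (0, 0), where each cos² term reaches its maximum of 1, the printed trajectory should show both components shrinking toward 0 and f(x) approaching -4.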

**Implementing an LSTM in PyTorch**

# -*- coding: utf-8 -*-
import os
import codecs
import time
from itertools import chain

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from gensim import models
from sklearn.metrics import accuracy_score

def read_Data(path, seg='train'):
    """Read reviews under path/seg/{pos,neg} and attach 1 (pos) / 0 (neg) labels."""
    labels = ['pos', 'neg']
    data = []
    for label in labels:
        for file in os.listdir(os.path.join(path, seg, label)):
            # 'with' ensures each review file is closed after reading
            with codecs.open(os.path.join(path, seg, label, file), 'r', 'utf-8') as f:
                review = f.read().replace('\n', '')
            data.append([review, 1 if label == 'pos' else 0])
    return data

train_data = read_Data('/Users/yangyang/Desktop/aclImdb')
test_data = read_Data('/Users/yangyang/Desktop/aclImdb','test')
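read_Data expects the standard directory layout of the aclImdb dataset (Stanford Large Movie Review Dataset):

```
aclImdb/
├── train/
│   ├── pos/   # 12,500 positive reviews, one .txt file per review
│   └── neg/   # 12,500 negative reviews
└── test/
    ├── pos/
    └── neg/
```

(The unlabeled train/unsup folder shipped with the dataset is ignored by this function.)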

def tokenizer(text):
    return [token.lower() for token in text.split(' ')]

# tokenized, lowercased reviews: one list of tokens per review
train_token = []
test_token = []

for text,label in train_data:
    train_token.append(tokenizer(text))
for text,label in test_data:
    test_token.append(tokenizer(text))

# build the deduplicated vocabulary from the training tokens
vocab = set(chain(*train_token))
vocab_size = len(vocab)

model = models.KeyedVectors.load_word2vec_format('/Users/yangyang/Desktop/glove/word2vec.6B.100d.txt',binary=False,encoding='utf-8')
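The raw GloVe download (glove.6B.100d.txt) is not in word2vec format, so the file loaded above was presumably produced by a one-time conversion; a minimal sketch using gensim's converter, with the input path assumed:

```python
# One-time conversion from GloVe text format to word2vec text format
# (input/output paths are assumptions, not from the original post):
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec('glove.6B.100d.txt', 'word2vec.6B.100d.txt')
```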


# assign each word a unique index; 0 is reserved for unknown words
word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
word_to_idx['<unk>'] = 0
idx_to_word = {i + 1: word for i, word in enumerate(vocab)}
idx_to_word[0] = '<unk>'

# map each token to its index; out-of-vocabulary tokens map to 0 (<unk>)
def encode_samples(data):
    result = []
    for items in data:
        tmp = []
        for strs in items:
            tmp.append(word_to_idx.get(strs, 0))
        result.append(tmp)
    return result
# unify all texts to max_length: truncate long ones, right-pad short ones with PAD
def pad_samples(data, max_length=500, PAD=0):
    padded_features = []
    for item in data:
        if len(item) >= max_length:
            pad_item = item[:max_length]
        else:
            pad_item = item + [PAD] * (max_length - len(item))
        padded_features.append(pad_item)
    return padded_features
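A quick sanity check of the two helpers on toy indices (hypothetical values, not from the real vocabulary):

```python
# encode_samples maps out-of-vocabulary tokens to 0 (<unk>);
# pad_samples truncates long sequences and right-pads short ones:
print(pad_samples([[1, 2, 3]], max_length=5))  # [[1, 2, 3, 0, 0]]
print(pad_samples([[1, 2, 3]], max_length=2))  # [[1, 2]]
```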

# features must be LongTensors for nn.Embedding; labels must be LongTensors for CrossEntropyLoss
train_features = torch.LongTensor(pad_samples(encode_samples(train_token)))
train_labels = torch.LongTensor([label for _, label in train_data])
test_features = torch.LongTensor(pad_samples(encode_samples(test_token)))  # encode the tokens, not the raw test_data
test_labels = torch.LongTensor([label for _, label in test_data])

class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, use_gpu, **kwargs):
        super(LSTM, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.use_gpu = use_gpu
        self.bidirectional = bidirectional
        # from_pretrained freezes the embedding by default; keep the GloVe vectors fixed
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
        # bidirectional=True gives a bidirectional LSTM (the default is False)
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.num_hiddens,
                               num_layers=num_layers, bidirectional=self.bidirectional,
                               dropout=0)
        # the decoder sees the first and last time steps concatenated,
        # each of size num_hiddens*2 (bidirectional) or num_hiddens (unidirectional)
        if self.bidirectional:
            self.decoder = nn.Linear(num_hiddens * 4, labels)
        else:
            self.decoder = nn.Linear(num_hiddens * 2, labels)

    def forward(self, inputs):
        # inputs: (batch, seq_len) -> embeddings: (batch, seq_len, embed_size), e.g. (64, 500, 100)
        embeddings = self.embedding(inputs)
        # nn.LSTM expects (seq_len, batch, input_size) by default -> (500, 64, 100);
        # output: (seq_len, batch, num_hiddens*2) = (500, 64, 200) in the bidirectional case
        output, hidden = self.encoder(embeddings.permute([1, 0, 2]))
        # concatenate first and last time steps: (64, 200) + (64, 200) -> (64, 400)
        encoding = torch.cat([output[0], output[-1]], dim=1)
        outputs = self.decoder(encoding)
        return outputs
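Before wiring in the real data, a minimal shape check with toy sizes and a random embedding matrix (all values here are stand-ins for illustration, not the real GloVe weights) confirms the forward pass:

```python
# Toy instantiation: vocab of 10, 8-dim embeddings, hidden size 6, 2 classes
toy_weight = torch.randn(10, 8)
toy_net = LSTM(vocab_size=10, embed_size=8, num_hiddens=6, num_layers=1,
               bidirectional=True, weight=toy_weight, labels=2, use_gpu=False)
dummy = torch.randint(0, 10, (4, 7))  # batch of 4 sequences of length 7
print(toy_net(dummy).shape)           # torch.Size([4, 2])
```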

num_epochs = 5
embed_size = 100  # must match the 100-dimensional GloVe vectors loaded above
num_hiddens = 100
num_layers = 2
bidirectional = True
batch_size = 64
labels = 2
lr = 0.8
use_gpu = torch.cuda.is_available()
device = torch.device('cuda:0' if use_gpu else 'cpu')

# embedding matrix: row i holds the GloVe vector of the word with index i;
# rows for words without a pretrained vector stay zero
weight = torch.zeros(vocab_size + 1, embed_size)

# index2word is the gensim < 4.0 API (renamed to index_to_key in gensim 4.x)
for i in range(len(model.index2word)):
    word = model.index2word[i]
    if word in word_to_idx:
        weight[word_to_idx[word], :] = torch.from_numpy(model.get_vector(word))

net = LSTM(vocab_size=(vocab_size + 1), embed_size=embed_size, num_hiddens=num_hiddens,
           num_layers=num_layers, bidirectional=bidirectional, weight=weight,
           labels=labels, use_gpu=use_gpu)
net.to(device)
loss_function = nn.CrossEntropyLoss()
# only optimize parameters that require gradients (the embedding is frozen)
optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)


train_set = TensorDataset(train_features, train_labels)
test_set = TensorDataset(test_features, test_labels)

train_iter = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_set, batch_size=batch_size, shuffle=False)
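As a quick diagnostic (not part of the original pipeline), peeking at one batch confirms the shapes the network will receive:

```python
feature, label = next(iter(train_iter))
print(feature.shape, label.shape)  # torch.Size([64, 500]) torch.Size([64])
```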

for epoch in range(num_epochs):
    start = time.time()
    train_loss, test_losses = 0, 0
    train_acc, test_acc = 0, 0
    n, m = 0, 0
    net.train()
    for feature, label in train_iter:
        n += 1
        optimizer.zero_grad()
        feature = feature.to(device)
        label = label.to(device)
        score = net(feature)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()

        train_acc += accuracy_score(label.cpu(), torch.argmax(score.cpu().data, dim=1))
        # .item() detaches the loss so the computation graph is not kept alive across batches
        train_loss += loss.item()

    net.eval()
    with torch.no_grad():
        for test_feature, test_label in test_iter:
            m += 1
            test_feature = test_feature.to(device)
            test_label = test_label.to(device)
            test_score = net(test_feature)
            test_loss = loss_function(test_score, test_label)
            test_acc += accuracy_score(test_label.cpu(), torch.argmax(test_score.cpu().data, dim=1))
            test_losses += test_loss.item()

    end = time.time()
    runtime = end - start
    # print inside the epoch loop so every epoch is reported, not just the last one
    print('epoch: %d, train loss: %.4f, train acc: %.2f, test loss: %.4f, test acc: %.2f, time: %.2f' %
          (epoch, train_loss / n, train_acc / n, test_losses / m, test_acc / m, runtime))
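After training, a minimal inference sketch might look like the following (the helper and the example review are illustrative additions, not from the original post):

```python
def predict(review_text):
    # tokenize, encode, and pad a single review, then classify it
    ids = pad_samples(encode_samples([tokenizer(review_text)]))
    x = torch.LongTensor(ids).to(device)
    net.eval()
    with torch.no_grad():
        probs = torch.softmax(net(x), dim=1)
    return 'pos' if probs[0, 1] > probs[0, 0] else 'neg'

print(predict('one of the best films I have seen this year'))
```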

Reference: https://samaelchen.github.io/pytorch_lstm_sentiment/

PyTorch is an open-source machine learning library that provides a rich set of tools and functions for building and training deep learning models. Implementing an LSTM (Long Short-Term Memory) model in PyTorch can be done in the following steps:

1. Import the necessary libraries and modules:

```python
import torch
import torch.nn as nn
```

2. Define the LSTM model class:

```python
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out
```

3. Create an instance of the LSTM model:

```python
input_size = ...   # feature dimension of the input data
hidden_size = ...  # feature dimension of the LSTM hidden state
num_layers = ...   # number of stacked LSTM layers
output_size = ...  # dimension of the output

model = LSTMModel(input_size, hidden_size, num_layers, output_size)
```

4. Define the loss function and optimizer:

```python
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
```

5. Train the LSTM model:

```python
epochs = ...  # number of training epochs

for epoch in range(epochs):
    # forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # print the loss every few steps
    if (epoch + 1) % print_interval == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, loss.item()))
```

With these steps we can implement and train a simple LSTM model in PyTorch; the details can of course be adjusted and extended to fit the task at hand.