TensorFlow 实现简单的 Encoder-Decoder-Attention 模型

原创 · 最新推荐文章于 2023-06-25 16:45:51 发布 · 1k 阅读
1 ·
CC 4.0 BY-SA版权
Artificial Neural Networks 专栏收录该内容
10 篇文章
订阅专栏
本文深入解析了编码层和解码层的实现细节，包括BiLSTM编码、注意力机制的引入，以及训练和预测阶段的解码过程。通过具体代码示例，阐述了如何在深度学习任务中构建高效的序列到序列模型。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
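# Build the encoding layer
'''
rnn_size : number of units in each LSTM cell
sequence_length : length of each input sequence (passed to the bidirectional RNN)
num_layers: number of encoder layers (variable scopes) to create
rnn_inputs: input data fed to the encoder
keep_prob: dropout keep probability applied to the cell inputs
'''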
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    """Encode the input sequence with a bidirectional LSTM.

    Args:
        rnn_size: Number of units in each LSTM cell.
        sequence_length: Forwarded as ``sequence_length`` to
            ``tf.nn.bidirectional_dynamic_rnn`` (presumably the unpadded
            length of every sequence in the batch -- confirm with caller).
        num_layers: Number of ``encoder_<i>`` variable scopes to build.
        rnn_inputs: Input tensor fed to the bidirectional RNN.
        keep_prob: Keep probability applied to the cell inputs by
            ``DropoutWrapper``.

    Returns:
        A tuple ``(enc_output, enc_state)``: the forward and backward outputs
        concatenated along axis 2, and the state pair returned by
        ``tf.nn.bidirectional_dynamic_rnn``.
    """
    # NOTE(review): every iteration is fed ``rnn_inputs`` and only the values
    # produced by the last iteration are returned, so the layers are not
    # stacked on top of each other. Left unchanged here because chaining them
    # would change the model architecture -- confirm the intent.
    for layer in range(num_layers):
        # One variable scope per layer so each layer owns its own weights.
        with tf.variable_scope('encoder_{}'.format(layer)):
            # Forward cell; dropout on the inputs acts as regularisation.
            cell_fw = tf.nn.rnn_cell.LSTMCell(
                num_units=rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell=cell_fw, input_keep_prob=keep_prob)

            # Backward cell, configured the same way as the forward cell.
            # (The original had a stray bare identifier here that raised
            # NameError at run time; it was meant to be a comment.)
            cell_bw = tf.nn.rnn_cell.LSTMCell(
                num_units=rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell=cell_bw, input_keep_prob=keep_prob)

            # Dynamic bidirectional RNN producing the outputs and final states.
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=rnn_inputs,
                sequence_length=sequence_length,
                dtype=tf.float32)

    # The bidirectional RNN returns the forward and backward outputs
    # separately; join them along the last axis to get the real output.
    enc_output = tf.concat(enc_output, 2)
    return enc_output, enc_state

#创建训练的decoder
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                            vocab_size, max_summary_length):
    """Build the decoder used at training time.

    Args:
        dec_embed_input: Embedded decoder inputs handed to ``TrainingHelper``
            (batch-major, since ``time_major=False``).
        summary_length: Sequence lengths handed to ``TrainingHelper``.
        dec_cell: RNN cell used by the decoder (the caller in this file passes
            an attention-wrapped LSTM cell).
        initial_state: Initial state for ``dec_cell``.
        output_layer: Layer applied to the cell outputs by ``BasicDecoder``.
        vocab_size: Unused in this function; kept so the signature stays
            compatible with existing callers.
        max_summary_length: Upper bound on the number of decoding steps.

    Returns:
        The first element returned by ``seq.dynamic_decode`` (the final
        outputs; per the seq2seq docs it exposes ``rnn_output`` and
        ``sample_id``).
    """
    # TrainingHelper (tf.contrib.seq2seq) is meant for training only: it reads
    # the decoder inputs.
    #   inputs          -- embedded inputs, [batch_size, sequence_length,
    #                      embedding_size] when time_major=False
    #   sequence_length -- length of each sequence
    #   time_major      -- if True, inputs are
    #                      [sequence_length, batch_size, embedding_size]
    training_helper = seq.TrainingHelper(inputs=dec_embed_input,
                                         sequence_length=summary_length,
                                         time_major=False)

    # BasicDecoder wraps the cell, the helper, the initial state and an
    # optional output layer (here a dense projection whose result is later
    # fed to softmax) into one decoder object.
    training_decoder = seq.BasicDecoder(cell=dec_cell,
                                        helper=training_helper,
                                        initial_state=initial_state,
                                        output_layer=output_layer)

    # dynamic_decode drives the decoder: one initialize() followed by repeated
    # step() calls inside a while_loop.
    #
    # About time_major (both here and in TrainingHelper):
    #   batch major -> [batch_size, sequence_length, embedding_size]
    #   time major  -> [sequence_length, batch_size, embedding_size]
    # The documentation notes that batch-major tensors add extra time to the
    # computation, i.e. time-major is the faster layout.
    #
    # The final outputs carry:
    #   rnn_output -- [batch_size, sequence_length, vocab_size]; feed it to
    #                 tf.nn.softmax to get probabilities
    #   sample_id  -- tf.int32 ids of the selected tokens
    decode_result = seq.dynamic_decode(training_decoder,
                                       output_time_major=False,
                                       impute_finished=True,
                                       maximum_iterations=max_summary_length)

    # dynamic_decode returns (final_outputs, final_state,
    # final_sequence_lengths); older releases returned only the first two.
    # Index instead of unpacking so either arity works (the original
    # two-name unpacking raises ValueError on a three-element result).
    training_logits = decode_result[0]
    return training_logits

#创建测试的decoder
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    """Build the greedy decoder used at inference time.

    Args:
        embeddings: Embedding argument handed to ``GreedyEmbeddingHelper``
            (the embedding variable, or a callable mapping ids to vectors).
        start_token: Id of the token every sequence starts with.
        end_token: Id of the token that marks the end of decoding.
        dec_cell: RNN cell used by the decoder.
        initial_state: Initial state for ``dec_cell``.
        output_layer: Layer applied to the cell outputs by ``BasicDecoder``.
        max_summary_length: Upper bound on the number of decoding steps.
        batch_size: Number of sequences decoded at once.

    Returns:
        The first element returned by ``seq.dynamic_decode`` (the final
        outputs of the inference decoder).
    """
    # tf.tile(input, multiples) repeats a tensor along each axis; the rank of
    # the result is unchanged and ``multiples`` needs one entry per axis of
    # ``input``. Example: tiling a [3, 2] tensor with multiples [2, 3] yields
    # a [6, 6] tensor. Here the one-element vector [start_token] is repeated
    # batch_size times, giving one start token per sequence in the batch.
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32),
                           [batch_size],
                           name='start_tokens')

    # GreedyEmbeddingHelper(embedding, start_tokens, end_token) is the
    # inference-time helper: it takes the argmax of the output (treated as
    # logits) and passes that id through the embedding to obtain the next
    # input.
    #   embedding    -- params for embedding_lookup, or a callable on ids
    #   start_tokens -- int32 vector of shape [batch_size]
    #   end_token    -- int32 scalar marking the end of decoding
    inference_helper = seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)

    # Same BasicDecoder as used for training, but driven by the greedy helper.
    inference_decoder = seq.BasicDecoder(dec_cell, inference_helper, initial_state, output_layer)

    decode_result = seq.dynamic_decode(inference_decoder,
                                       output_time_major=False,
                                       impute_finished=True,
                                       maximum_iterations=max_summary_length)

    # dynamic_decode returns (final_outputs, final_state,
    # final_sequence_lengths); older releases returned only the first two.
    # Index instead of unpacking so either arity works (the original
    # two-name unpacking raises ValueError on a three-element result).
    inference_logits = decode_result[0]
    return inference_logits

#创建真正的解码层 引入注意力机制
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    """Build the attention decoder and return its training and inference outputs.

    Args:
        dec_embed_input: Embedded decoder inputs for the training decoder.
        embeddings: Embedding argument for the inference decoder.
        enc_output: Encoder outputs, used as the attention memory.
        enc_state: Encoder final state; ``enc_state[0]`` seeds the decoder
            cell state.
        vocab_size: Number of units of the dense output layer.
        text_length: Passed to ``BahdanauAttention`` as
            ``memory_sequence_length`` (used to mask padded positions).
        summary_length: Sequence lengths for the training decoder.
        max_summary_length: Upper bound on the number of decoding steps.
        rnn_size: Number of LSTM units; also used as the attention
            ``num_units`` and ``attention_layer_size``.
        vocab_to_int: Mapping that must contain the keys ``'<GO>'`` and
            ``'<EOS>'``.
        keep_prob: Keep probability applied to the cell inputs by
            ``DropoutWrapper``.
        batch_size: Batch size used for the initial state and start tokens.
        num_layers: Number of ``decoder_<i>`` variable scopes to build.

    Returns:
        A tuple ``(training_logits, inference_logits)``.
    """
    # NOTE(review): each iteration overwrites ``dec_cell``, so only the cell
    # built in the last iteration is actually used; the layers are not
    # stacked. Left as in the original -- confirm the intent.
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            # LSTM cell with dropout on its inputs.
            lstm = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)

    # Dense projection applied inside the decoder; its size is the vocabulary
    # size.
    output_layer = Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # BahdanauAttention(num_units, memory, memory_sequence_length=None,
    #                   normalize=False, probability_fn=None,
    #                   score_mask_value=None, dtype=None, name=...)
    #   num_units  -- size of the query/memory dense layers used to compute
    #                 the attention score; query and memory are projected to
    #                 this size first. The resulting alignments sum to 1.
    #   memory     -- the encoder context, [batch_size, max_time, ...]
    #   memory_sequence_length -- true encoder input lengths, [batch_size];
    #                 used to build the mask for padded positions
    #   normalize  -- whether to normalise the energy term
    #   probability_fn -- turns scores into probabilities (softmax by default)
    #   score_mask_value -- score given to masked positions (-inf by default)
    # Usually num_units and memory are enough; pass memory_sequence_length
    # whenever padding must be masked. Masked memory entries are zeroed when
    # the attention values are formed, and their alignments are also 0
    # because their score is set to -inf before the softmax.
    attn_mech = seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False, name='BahdanauAttention')

    # AttentionWrapper(cell, attention_mechanism, attention_layer_size=None,
    #                  alignment_history=False, cell_input_fn=None,
    #                  output_attention=True, initial_cell_state=None, ...)
    #   cell       -- a single RNN cell or a stacked multi-layer cell
    #   attention_layer_size -- if None the weighted context vector is
    #                 returned directly; otherwise it is concatenated with the
    #                 cell output and linearly projected to this size
    #   alignment_history -- keep per-step alignments (useful for plots)
    #   cell_input_fn -- how input and previous attention are combined before
    #                 entering the cell (concatenation by default)
    #   output_attention -- if False, return the raw cell output instead of
    #                 the attention; the attention is stored in the wrapper
    #                 state either way
    dec_cell = seq.AttentionWrapper(cell=dec_cell, attention_mechanism=attn_mech, attention_layer_size=rnn_size)

    # Initial state for the decoder: start from the wrapper's zero state and
    # replace its cell state with the encoder's final state (enc_state[0]).
    # Building AttentionWrapperState by hand with two positional fields fails
    # because the state tuple of this AttentionWrapper API has more fields;
    # zero_state(...).clone(...) fills the remaining ones correctly.
    initial_state = dec_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=enc_state[0])

    # Training decoder.
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input,
                                                  summary_length,
                                                  dec_cell,
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size,
                                                  max_summary_length)
    # Inference decoder; reuse=True makes it share the training variables.
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,
                                                    vocab_to_int['<GO>'],
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell,
                                                    initial_state,
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
    return training_logits, inference_logits