#创建编码层
'''
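rnn_size: number of units in each LSTM cell
sequence_length: per-example input lengths, passed to the RNN as sequence_length
num_layers: number of bidirectional encoder layers to create
rnn_inputs: input tensor fed to the encoder
keep_prob: dropout keep probability applied to the cell inputs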
'''
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    """Build the encoder as a stack of bidirectional LSTM layers.

    Args:
        rnn_size: number of units in each LSTM cell.
        sequence_length: per-example lengths, forwarded to
            ``tf.nn.bidirectional_dynamic_rnn`` as ``sequence_length``.
        num_layers: number of bidirectional layers to create (must be >= 1).
        rnn_inputs: input tensor fed to the first layer.
        keep_prob: input keep probability of the dropout wrapper placed
            around every cell.

    Returns:
        ``(enc_output, enc_state)`` of the last layer. ``enc_output`` is the
        forward and backward outputs concatenated along axis 2;
        ``enc_state`` is the state pair returned by
        ``tf.nn.bidirectional_dynamic_rnn``.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError(
            'num_layers must be at least 1, got {}'.format(num_layers))

    # Each layer consumes the previous layer's concatenated output; the
    # first one consumes the raw inputs. (Feeding ``rnn_inputs`` to every
    # layer would leave all but the last layer unused.)
    layer_input = rnn_inputs
    for layer in range(num_layers):
        # One variable scope per layer so every layer owns its parameters.
        with tf.variable_scope('encoder_{}'.format(layer)):
            # Forward RNN; dropout on the cell inputs acts as regularisation.
            cell_fw = tf.nn.rnn_cell.LSTMCell(
                num_units=rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(
                cell=cell_fw, input_keep_prob=keep_prob)

            # Backward RNN, configured the same way as the forward one.
            cell_bw = tf.nn.rnn_cell.LSTMCell(
                num_units=rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                cell=cell_bw, input_keep_prob=keep_prob)

            # Dynamic bidirectional RNN producing (outputs, states).
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=layer_input,
                sequence_length=sequence_length,
                dtype=tf.float32)

        # The real output of a bidirectional layer is the forward and
        # backward outputs joined on the feature axis.
        enc_output = tf.concat(enc_output, 2)
        layer_input = enc_output

    return enc_output, enc_state
#创建训练的decoder
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                            vocab_size, max_summary_length):
    """Build the training-time decoder and return its decoded outputs.

    Args:
        dec_embed_input: embedded decoder inputs, given to ``TrainingHelper``
            as ``inputs`` (batch-major, because ``time_major=False``).
        summary_length: per-example target lengths, given to
            ``TrainingHelper`` as ``sequence_length``.
        dec_cell: decoder RNN cell handed to ``BasicDecoder``.
        initial_state: initial decoder state handed to ``BasicDecoder``.
        output_layer: layer applied to the cell outputs by ``BasicDecoder``.
        vocab_size: not used by this function; kept so callers need not change.
        max_summary_length: upper bound on the number of decoding steps
            (``maximum_iterations`` of ``dynamic_decode``).

    Returns:
        The first element returned by ``seq.dynamic_decode``
        (``final_outputs``).
    """
    # TrainingHelper is a decoder helper that is only usable during training;
    # its job is to read the inputs.
    #   inputs          embedded input, [batch_size, sequence_length, embedding_size]
    #   sequence_length sequence lengths
    #   time_major      if True, inputs are [sequence_length, batch_size, embedding_size]
    training_helper = seq.TrainingHelper(
        inputs=dec_embed_input,
        sequence_length=summary_length,
        time_major=False)

    # BasicDecoder bundles the (attention-wrapped) LSTM cell into a decoder.
    #   cell          the RNN cell
    #   helper        the helper created above
    #   initial_state initial decoder state
    #   output_layer  optional projection applied to the cell output before
    #                 softmax, e.g. layers_core.Dense(num_emb, use_bias=False)
    training_decoder = seq.BasicDecoder(
        cell=dec_cell,
        helper=training_helper,
        initial_state=initial_state,
        output_layer=output_layer)

    # dynamic_decode drives the decoder: one initialize() followed by repeated
    # step() calls inside a while_loop. Its remaining options are
    # parallel_iterations=32, swap_memory=False and scope=None.
    #
    # "Batch major" means batch_size is the leading dimension
    # ([batch_size, sequence_length, embedding_size]); "time major" means the
    # time step is ([sequence_length, batch_size, embedding_size]). Batch-major
    # tensors add extra computation time, so time-major is the faster layout.
    #
    # The documented return value is
    # (final_outputs, final_state, final_sequence_lengths), where
    # final_outputs holds
    #   rnn_output  [batch_size, sequence_length, vocab_size], the values that
    #               go into tf.nn.softmax
    #   sample_id   tf.int32 ids holding the decoded result
    #
    # Index the result instead of unpacking it: unpacking into two names fails
    # when three values are returned, while indexing works for either arity.
    decode_result = seq.dynamic_decode(
        training_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_length)
    training_logits = decode_result[0]
    return training_logits
#创建测试的decoder
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    """Build the inference-time (greedy) decoder and return its decoded outputs.

    Args:
        embeddings: embedding argument for ``GreedyEmbeddingHelper`` (the
            embedding variable, or a callable mapping ids to embeddings).
        start_token: id of the token every sequence starts with.
        end_token: id of the token that marks the end of decoding.
        dec_cell: decoder RNN cell handed to ``BasicDecoder``.
        initial_state: initial decoder state handed to ``BasicDecoder``.
        output_layer: layer applied to the cell outputs by ``BasicDecoder``.
        max_summary_length: upper bound on the number of decoding steps.
        batch_size: number of sequences in the batch; the start token is
            repeated this many times.

    Returns:
        The first element returned by ``seq.dynamic_decode``
        (``final_outputs``).
    """
    # tf.tile(input, multiples) replicates a tensor along each dimension
    # without changing its rank; multiples has one entry per dimension of
    # input. Example: tiling the 3x2 tensor [[1, 2], [3, 4], [5, 6]] with
    # [2, 3] gives a 6x6 tensor. Here a one-element vector is repeated
    # batch_size times to get one start token per sequence.
    start_tokens = tf.tile(
        tf.constant([start_token], dtype=tf.int32),
        [batch_size],
        name='start_tokens')

    # GreedyEmbeddingHelper(embedding, start_tokens, end_token) is the helper
    # for inference: it takes the argmax of the output (treated as logits) and
    # passes that id through the embedding to obtain the next input.
    #   embedding    params argument for embedding_lookup, or a callable
    #   start_tokens int32 vector of shape [batch_size]
    #   end_token    int32 scalar marking the end of decoding
    inference_helper = seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)

    # Basic decoder, used the same way as in the training decoder.
    inference_decoder = seq.BasicDecoder(
        dec_cell, inference_helper, initial_state, output_layer)

    # Index the result instead of unpacking it: unpacking into two names fails
    # when dynamic_decode returns
    # (final_outputs, final_state, final_sequence_lengths).
    decode_result = seq.dynamic_decode(
        inference_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_length)
    inference_logits = decode_result[0]
    return inference_logits
#创建真正的解码层 引入注意力机制
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    """Build the attention decoder and return training and inference outputs.

    Args:
        dec_embed_input: embedded decoder inputs for training.
        embeddings: embedding argument used by the inference helper.
        enc_output: encoder outputs, used as the attention memory.
        enc_state: encoder final state; ``enc_state[0]`` seeds the decoder
            cell state.
        vocab_size: size of the output projection.
        text_length: per-example encoder input lengths, used to mask the
            attention memory.
        summary_length: per-example target lengths for training.
        max_summary_length: upper bound on the number of decoding steps.
        rnn_size: number of LSTM units; also used as the attention size.
        vocab_to_int: mapping that must contain ``'<GO>'`` and ``'<EOS>'``.
        keep_prob: input keep probability of the dropout wrapper.
        batch_size: number of sequences in the batch.
        num_layers: number of decoder cell scopes to create (must be >= 1).

    Returns:
        ``(training_logits, inference_logits)``.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError(
            'num_layers must be at least 1, got {}'.format(num_layers))

    # NOTE(review): every iteration rebinds dec_cell, so only the cell from the
    # last iteration is used below - confirm whether a stacked decoder was
    # intended.
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            # LSTM unit with dropout on its inputs.
            lstm = tf.nn.rnn_cell.LSTMCell(
                rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)

    # Fully connected layer; its size is the size of the decoder output.
    output_layer = Dense(
        vocab_size,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # BahdanauAttention(num_units, memory, memory_sequence_length=None,
    #                   normalize=False, probability_fn=None,
    #                   score_mask_value=None, dtype=None, name=...)
    #   num_units  size of the query and memory Dense layers; query and memory
    #              are projected to num_units on their last dimension before
    #              the attention scores (alignments, summing to 1) are computed
    #   memory     the encoder context, [batch_size, max_time, ...]
    #   memory_sequence_length  true encoder input lengths, [batch_size], used
    #              to build the mask for the padded positions
    #   normalize  whether to normalise the energy term
    #   probability_fn  turns scores into probabilities; softmax by default,
    #              hardmax and sparsemax are alternatives
    #   score_mask_value  defaults to -inf; applied to padded positions when
    #              memory_sequence_length is given
    # Usually only num_units and memory are needed; pass
    # memory_sequence_length when masking is required. Positions beyond the
    # given length are zeroed in the values, stay zero in the keys (the
    # Dense-projected values) and end up with zero alignment because their
    # score is masked to -inf before the softmax.
    attn_mech = seq.BahdanauAttention(
        rnn_size, enc_output, text_length,
        normalize=False, name='BahdanauAttention')

    # AttentionWrapper(cell, attention_mechanism, attention_layer_size=None,
    #                  alignment_history=False, cell_input_fn=None,
    #                  output_attention=True, initial_cell_state=None, name=None)
    #   cell       an RNN cell, single or a multi-layer stack
    #   attention_layer_size  if None the attention is the weighted context
    #              vector itself; otherwise the context is concatenated with
    #              the cell output and linearly projected to this size
    #   alignment_history  keep each step's alignment in the state (useful for
    #              visualisation)
    #   cell_input_fn  how input and previous attention are combined before
    #              entering the cell; by default they are concatenated
    #   output_attention  return the attention rather than the raw cell
    #              output; the attention is stored in the state either way
    #   initial_cell_state  optional initial state; its batch size must match
    #              the one given to zero_state
    dec_cell = seq.AttentionWrapper(
        cell=dec_cell,
        attention_mechanism=attn_mech,
        attention_layer_size=rnn_size)

    # Initial state for the decoder: start from the wrapper's own zero state
    # and swap in the encoder's final state as the cell state. The wrapper
    # state carries more fields than (cell_state, attention), so it cannot be
    # built from two positional values.
    initial_state = dec_cell.zero_state(
        batch_size=batch_size, dtype=tf.float32).clone(cell_state=enc_state[0])

    # Training decoder.
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(
            dec_embed_input,
            summary_length,
            dec_cell,
            initial_state,
            output_layer,
            vocab_size,
            max_summary_length)

    # Inference decoder; reuses the variables created for training.
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(
            embeddings,
            vocab_to_int['<GO>'],
            vocab_to_int['<EOS>'],
            dec_cell,
            initial_state,
            output_layer,
            max_summary_length,
            batch_size)

    return training_logits, inference_logits
# Source article: a simple encoder-decoder with attention implemented in TensorFlow
# (web-page residue) latest recommended article published 2023-06-25 16:45:51