#keras #seq2seq
Question:
I have built a bidirectional seq2seq model in Keras for abstractive text summarization. It uses pre-trained word2vec embeddings and the attention layer implemented here. The model structure is as follows:
import numpy as np
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Concatenate, TimeDistributed, Dense
from tensorflow.keras.models import Model
# AttentionLayer is the custom attention layer implemented in the link above
content_len = 60
summary_len = 52
# content_embedding_matrix and summary_embedding_matrix contain the Word2Vec embedding
# weights for the words in the contents and in the summaries, respectively
#encoder
encoder_input = Input(shape=(content_len,))
encoder_embedding = Embedding(input_dim=content_embedding_matrix.shape[0], output_dim=embedding_dim, weights=[content_embedding_matrix], trainable=False)(encoder_input)
encoder_bidr = Bidirectional(LSTM(embedding_dim, return_sequences=True, return_state=True, dropout=0.3))
encoder_output, forw_hidden_state, forw_cell_state, backw_hidden_state, backw_cell_state = encoder_bidr(encoder_embedding)
concat_hidden_state = Concatenate()([forw_hidden_state, backw_hidden_state])
concat_cell_state = Concatenate()([forw_cell_state, backw_cell_state])
encoder_states = [concat_hidden_state, concat_cell_state]
# decoder
decoder_input = Input(shape=(summary_len,))
decoder_embedding_layer = Embedding(input_dim=summary_embedding_matrix.shape[0], output_dim=embedding_dim, weights=[summary_embedding_matrix], trainable=False)
decoder_embedding = decoder_embedding_layer(decoder_input)
decoder_lstm = LSTM(embedding_dim*2, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
attn = AttentionLayer(name='AttentionLayer')
context, attn_weights = attn([encoder_output, decoder_output])
concat_decoder_output = Concatenate(axis=-1)([decoder_output, context])
td_layer = TimeDistributed(Dense(summary_embedding_matrix.shape[0], activation='softmax'))
output = td_layer(concat_decoder_output)
m = Model([encoder_input, decoder_input], output)
m.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
m.load_weights('weights.hdf5') #I previously trained the model, hence I loaded the weights
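For context, content_embedding_matrix and summary_embedding_matrix come from the pre-trained Word2Vec vectors. A minimal sketch of how such a matrix can be assembled, assuming a trained gensim model and a word-to-index mapping (w2v_model, word_to_index_content and the dimension value are placeholder names, not the actual ones from my code):
# Illustrative sketch only: building an embedding matrix like content_embedding_matrix
# from a trained gensim Word2Vec model and a word-to-index mapping.
embedding_dim = 100  # must match the dimensionality of the Word2Vec vectors
content_embedding_matrix = np.zeros((len(word_to_index_content) + 1, embedding_dim))
for word, idx in word_to_index_content.items():
    if word in w2v_model.wv:
        content_embedding_matrix[idx] = w2v_model.wv[word]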
And this is the inference model needed to generate the summary for a given input sequence:
#encoder inference model
encoder_model = Model(encoder_input, outputs=[encoder_output, encoder_states])
#decoder inference model
decoder_input_h_state = Input(shape=(embedding_dim*2,))
decoder_input_cell_state = Input(shape=(embedding_dim*2,))
decoder_input_hidden_state = Input(shape=(content_len, embedding_dim*2))
decoder_states = [decoder_input_h_state, decoder_input_cell_state]
decoder_embeddings = decoder_embedding_layer(decoder_input)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embeddings, initial_state=decoder_states)
context, attn_weights = attn([decoder_input_hidden_state, decoder_outputs2])
concat_decoder = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, context])
decoder_outputs2 = td_layer(concat_decoder)
# Final decoder model
decoder_model = Model(
    [decoder_input] + [decoder_input_hidden_state, decoder_input_h_state, decoder_input_cell_state],
    [decoder_outputs2] + [state_h2, state_c2])
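For reference, the wiring of the two inference models can be checked quickly by printing their layer/shape summaries (this only inspects the graphs, it does not run a prediction):
# Quick check of the inference graphs: input/output shapes per layer.
encoder_model.summary()
decoder_model.summary()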
Finally, the function that uses the inference models to generate the target sequence:
def decode_sequence(input_seq, word_to_index_summary, encoder_model, decoder_model, summary_len):
    # Encode the input as state vectors.
    e_out, e_states = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first word of the target sequence with the start word.
    target_seq[0, 0] = word_to_index_summary['START']
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out] + e_states)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = index_to_word_summary[sampled_token_index]
        if sampled_token != 'END':
            decoded_sentence += ' ' + sampled_token
        # Exit condition: either hit max length or find the stop word.
        if sampled_token == '_END_' or len(decoded_sentence.split()) >= (summary_len - 1):
            stop_condition = True
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        # Update internal states
        e_h, e_c = h, c
    return decoded_sentence
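This is roughly how decode_sequence is invoked (sketch; content_tokenizer and content_text are placeholders for whatever turns the raw text into a padded index sequence):
# Hypothetical call site; the tokenizer and text names are illustrative.
from tensorflow.keras.preprocessing.sequence import pad_sequences
seq = content_tokenizer.texts_to_sequences([content_text])
seq = pad_sequences(seq, maxlen=content_len, padding='post')
print(decode_sequence(seq, word_to_index_summary, encoder_model, decoder_model, summary_len))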
But when I run decode_sequence, all I get is the repetition of the ‘START’ token, for example:
Content: sub ends 26th renewing normally would renew 26th insert time 16 bucks however time seven days added without charging 16 bucks new month seven days
Original summary: once your subscription ends even if renewing the 7 days will be added before the next billing cycle
Predicted summary: START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START START
I understand that when sampled_token_index = np.argmax(output_tokens[0, -1, :]) is called in decode_sequence, it always computes the same value. But I don't get why, and I'm stuck because I can't even tell whether the problem is in decode_sequence or in the inference model. Can somebody give me a hint? Have you ever faced this problem?