Seq2Seq: LSTM-Based Encoder-Decoder Model Cannot Save and Reload Training States
ruiyuanlu opened this issue
Hi there. I tried to create an LSTM-based English-to-Chinese translator. It took me several days of training to reach a reasonable accuracy, but when I tried to save the encoder-decoder model, I got a warning.
Although the model was saved, I found that when I reloaded it and continued training with `encoder_decoder_model.fit_generator()`, the training loss was high and the accuracy was very close to 0! Any ideas what is going on here? Thanks in advance.
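(For reference, the save/reload round-trip follows the standard Keras pattern sketched below; 's2s_model.h5' is a placeholder path, not the exact code I ran.)

```python
from keras.models import load_model

# Save the whole model: architecture + weights + optimizer state.
encoder_decoder_model.save('s2s_model.h5')  # placeholder path

# ... later, in a fresh session ...
restored = load_model('s2s_model.h5')
# Resuming training on `restored` is where the loss/accuracy look re-initialized.
```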
Update
I've tried to load weights manually using the following code. Still, the training accuracy after reloading is close to 0.
My `save_weights`:

```python
import numpy as np

def save_weights(model, path):
    """Save model weights, and return the path."""
    weights_num_per_layer = [len(layer.get_weights()) for layer in model.layers]
    # The weight list is ragged, so NumPy pickles it as an object array.
    np.savez(path, weights_num_per_layer=weights_num_per_layer,
             weights_matrix=model.get_weights())
    return path if path.endswith('.npz') else "%s.npz" % path  # savez appends .npz
```
My `load_weights`:

```python
import numpy as np

def load_weights(model, path):
    """Load model weights from path, and return the model."""
    loaded = np.load(path)  # newer NumPy versions may need allow_pickle=True
    weights_num = loaded['weights_num_per_layer']
    weights_matrix = loaded['weights_matrix']
    cnt = 0
    for n, layer in zip(weights_num, model.layers):
        layer.set_weights(list(weights_matrix[cnt:cnt + n]))
        cnt += n
    return model
```
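For comparison, Keras's built-in weight serialization can stand in for the manual NumPy round-trip above (a minimal sketch; 'weights.h5' is a placeholder path, and h5py must be installed):

```python
# Save weights to HDF5 (requires h5py).
encoder_decoder_model.save_weights('weights.h5')  # placeholder path

# Rebuild the identical architecture first, then restore the weights:
encoder_decoder_model.load_weights('weights.h5')
```

Either way, the rebuilt architecture (including layer order) must match the saved one exactly, or the weights land in the wrong layers.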
The following code builds my encoder-decoder model.
Encoder part
```python
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

encoder_input_layer = Input(shape=(None,), name='encoder_Input')
encoder_embedding_layer = Embedding(src_token_num, embedding_dim, name='encoder_Embedding')(encoder_input_layer)
# Each stacked LSTM returns [sequences, state_h, state_c]; passing the whole list on
# feeds the previous layer's final states in as the next layer's initial states.
encoder_lstm_1_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='encoder_LSTM_1')(encoder_embedding_layer)
encoder_lstm_2_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='encoder_LSTM_2')(encoder_lstm_1_layer)
encoder_output, state_h, state_c = LSTM(embedding_dim, return_state=True, name='encoder_LSTM_Final')(encoder_lstm_2_layer)
encoder_states = [state_h, state_c]  # encoder_output discarded
```
Decoder part
```python
decoder_input_layer = Input(shape=(None,), name='decoder_Input')
decoder_embedding_output = Embedding(trgt_token_num, embedding_dim, name='decoder_Embedding')(decoder_input_layer)
# Use encoder states to initialize the first decoder LSTM
decoder_lstm_1_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='decoder_LSTM_1')
decoder_lstm_1_layer_output = decoder_lstm_1_layer(decoder_embedding_output, initial_state=encoder_states)
decoder_lstm_2_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='decoder_LSTM_2')
decoder_lstm_2_layer_output = decoder_lstm_2_layer(decoder_lstm_1_layer_output)
# state_h and state_c of the final decoder LSTM are discarded
decoder_lstm_final_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='decoder_LSTM_Final')
decoder_lstm_output, _, _ = decoder_lstm_final_layer(decoder_lstm_2_layer_output)
# Classify words
decoder_dense_1_layer = Dense(embedding_dim, activation='relu', name='decoder_Dense_1_relu')
decoder_dense_1_output = decoder_dense_1_layer(decoder_lstm_output)
decoder_dense_final_layer = Dense(trgt_token_num, activation='softmax', name='decoder_Dense_Final')  # softmax, so outputs are valid probabilities for categorical_crossentropy
decoder_dense_output = decoder_dense_final_layer(decoder_dense_1_output)
```
Combine encoder & decoder
```python
encoder_decoder_model = Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=decoder_dense_output)
```
Compile config & Model summary
```python
encoder_decoder_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
encoder_decoder_model.summary()
```
Plot Model structure
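(The plot image itself isn't reproduced here; a minimal sketch of generating one with Keras's plot_model utility, assuming pydot and graphviz are installed:)

```python
from keras.utils import plot_model

# Write the model graph, with tensor shapes, to an image file (placeholder name).
plot_model(encoder_decoder_model, to_file='encoder_decoder.png', show_shapes=True)
```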
Training Data Generator
```python
import numpy as np

def init_input_matrix(batch_size, max_src_sen_len, max_trgt_sen_len, trgt_token_num, dtype):
    """Allocate zeroed encoder-input, decoder-input and decoder-output batches."""
    return (np.zeros((batch_size, max_src_sen_len), dtype=dtype),                   # Encoder input : English ---> encoder
            np.zeros((batch_size, max_trgt_sen_len), dtype=dtype),                  # Decoder input : Chinese ---> decoder
            np.zeros((batch_size, max_trgt_sen_len, trgt_token_num), dtype=dtype))  # Decoder output: decoder ---> next (Chinese) token

def onehot_generator(src_corpus, trgt_corpus, max_src_sen_len, max_trgt_sen_len,
                     src_index, trgt_index, batch_size=32, dtype='float32'):
    cnt, trgt_token_num = 0, len(trgt_index)
    while True:
        for src_sen, trgt_sen in zip(src_corpus, trgt_corpus):
            # (Re)allocate a fresh batch
            if cnt == 0:
                encoder_input_data, decoder_input_data, decoder_output_data = init_input_matrix(
                    batch_size, max_src_sen_len, max_trgt_sen_len, trgt_token_num, dtype)
            # Add one sentence pair to the batch
            for j, src_word in enumerate(src_sen):
                encoder_input_data[cnt, j] = src_index[src_word]
            for k, trgt_word in enumerate(trgt_sen):
                decoder_input_data[cnt, k] = trgt_index[trgt_word]
                if k > 0:
                    decoder_output_data[cnt, k - 1, trgt_index[trgt_word]] = 1  # one-hot target, shifted one step left
            cnt += 1
            # Hand a full batch to the caller
            if cnt == batch_size:
                cnt = 0
                # x: [encoder_input, decoder_input], y: decoder_output
                yield [encoder_input_data, decoder_input_data], decoder_output_data
```
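(A quick sanity check, sketched with the same variable names used in the training config below: pull one batch and confirm the shapes match the model's inputs and output.)

```python
# Peek at one batch to verify shapes before training.
gen = onehot_generator(en, cn, max_en_len, max_cn_len, en_index, cn_index, batch_size=25)
[enc_in, dec_in], dec_out = next(gen)
print(enc_in.shape)   # (25, max_en_len)
print(dec_in.shape)   # (25, max_cn_len)
print(dec_out.shape)  # (25, max_cn_len, len(cn_index))
```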
Training Config
```python
batch_size = 25
ed_gen_history = encoder_decoder_model.fit_generator(
    onehot_generator(en, cn, max_en_len, max_cn_len, en_index, cn_index, batch_size=batch_size),
    steps_per_epoch=data_num // batch_size,  # use steps_per_epoch=2 for a quick smoke test
    epochs=10)
```
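(While debugging the reload problem, checkpointing during training with Keras's ModelCheckpoint callback is a useful safety net; a sketch, with a placeholder file pattern:)

```python
from keras.callbacks import ModelCheckpoint

# Write weights to disk at the end of every epoch (placeholder file pattern).
checkpoint = ModelCheckpoint('s2s_weights.{epoch:02d}.h5', save_weights_only=True)
ed_gen_history = encoder_decoder_model.fit_generator(
    onehot_generator(en, cn, max_en_len, max_cn_len, en_index, cn_index, batch_size=batch_size),
    steps_per_epoch=data_num // batch_size,
    epochs=10,
    callbacks=[checkpoint])
```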
Environment
sys: Windows 10
py: Anaconda3-5.1.0-Windows-x86_64 (Python 3.6)
keras: 2.1.5
tensorflow: 1.6.0 (backend)
CUDA: 9
cuDNN: 7
Take a look at keras-team/keras#9119. @simra's answer there provides an example of how to do it: https://github.com/simra/keras/blob/simra/s2srestore/examples/lstm_seq2seq_restore.py
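(The gist of that script, sketched against the layer names defined above: reload the trained model and rebuild the inference-time encoder from its layers by name, rather than retraining from scratch. 's2s_model.h5' is a placeholder path.)

```python
from keras.models import Model, load_model

model = load_model('s2s_model.h5')  # placeholder path

# Recover the encoder inference model from the trained graph.
encoder_inputs = model.inputs[0]  # the 'encoder_Input' tensor
_, state_h, state_c = model.get_layer('encoder_LSTM_Final').output
encoder_model = Model(encoder_inputs, [state_h, state_c])
```

The decoder side is rebuilt the same way, with fresh Input tensors for the states passed as initial_state, as the linked script shows for the single-layer case.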