google / seq2seq

A general-purpose encoder-decoder framework for Tensorflow

Home Page:https://google.github.io/seq2seq/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Seq2Seq: LSTM based Encoder-Decoder Model Cannot Save and Reload Training States.

ruiyuanlu opened this issue · comments

Hi there. I tried to create an LSTM-based English-to-Chinese translator. It took me several days to reach a reasonable accuracy. When I tried to save the encoder-decoder model, I got a warning:
image
Although the model was saved, I found that when I reloaded it and tried to continue training it using

encoder_decoder_model.fit_generator()

The training loss was high and accuracy was very close to 0!!! (As shown below).
Any ideas what was going on here? Thx in advance.
image

Update

I've tried to load weights manually using the following code. Still, the training accuracy after reloading is close to 0.

my save_weights:

import numpy as np
def save_weights(model, path):
    """Save model weights to an ``.npz`` archive, and return its path.

    Two arrays are written:

    * ``weights_num_per_layer`` -- how many weight arrays each entry of
      ``model.layers`` owns, in layer order.
    * ``weights_matrix`` -- the arrays returned by ``model.get_weights()``,
      stored as a 1-D object array because they generally differ in shape.
      Object arrays are pickled inside the archive, so reading them back
      requires ``np.load(..., allow_pickle=True)``.

    Args:
        model: Object exposing ``layers`` (each with ``get_weights()``) and
            ``get_weights()``.
        path: Destination file name; ``np.savez`` appends ``.npz`` when the
            name does not already end with it.

    Returns:
        The name of the file that was written (always ending in ``.npz``).
    """
    # Count on the model that was passed in, not on a module-level global.
    weights_num_per_layer = [len(layer.get_weights()) for layer in model.layers]

    # Fill the object array element by element: this keeps one entry per
    # weight array even when all of them happen to share the same shape, and
    # avoids NumPy's implicit (and now rejected) ragged-array conversion.
    weights = model.get_weights()
    weights_matrix = np.empty(len(weights), dtype=object)
    for i, weight in enumerate(weights):
        weights_matrix[i] = weight

    np.savez(path,
             weights_num_per_layer=weights_num_per_layer,
             weights_matrix=weights_matrix)
    return path if path.endswith('.npz') else "%s.npz" % path

my load_weights:

import numpy as np

def load_weights(model, path):
    """Load model weights from path, and return the model.

    The archive is expected to hold ``weights_num_per_layer`` (number of
    weight arrays per layer) and ``weights_matrix`` (all weight arrays in
    one flat sequence). The flat sequence is cut into consecutive slices of
    the recorded sizes and each slice is handed to the matching layer's
    ``set_weights``.

    Args:
        model: Object exposing ``layers``; each layer must provide
            ``set_weights(list_of_arrays)``.
        path: File name of the ``.npz`` archive.

    Returns:
        The same ``model`` object, after its layers have been updated.
    """
    # The weight arrays have different shapes, so they are stored as a
    # pickled object array; NumPy refuses to read that unless allow_pickle
    # is set. NOTE: unpickling can execute arbitrary code -- only load
    # archives from a trusted source.
    # The context manager closes the underlying file once the data is read.
    with np.load(path, allow_pickle=True) as loaded:
        weights_num = [int(n) for n in loaded['weights_num_per_layer']]
        weights_matrix = list(loaded['weights_matrix'])

    cnt = 0
    for n, layer in zip(weights_num, model.layers):
        layer.set_weights(weights_matrix[cnt:cnt + n])
        cnt += n
    return model

The following is the code of my encoder-decoder model.

Encoder part

# Encoder graph: token ids -> embedding -> three stacked LSTMs.
# Input is a variable-length sequence per sample (shape=(None,)).
encoder_input_layer = Input(shape=(None,), name='encoder_Input')
# Maps each of the src_token_num source ids to an embedding_dim vector.
encder_embedding_layer = Embedding(src_token_num, embedding_dim, name='encoder_Embedding')(encoder_input_layer)
# NOTE(review): with return_state=True an LSTM call returns the output together
# with its two states (see the three-way unpacking on the final LSTM below), so
# `encoder_lstm_1_layer` / `encoder_lstm_2_layer` hold that whole result rather
# than a single tensor. The whole result is passed on to the next LSTM;
# presumably Keras treats the extra entries as that layer's initial state --
# confirm this is intended.
encoder_lstm_1_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='encoder_LSTM_1')(encder_embedding_layer)
encoder_lstm_2_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='encoder_LSTM_2')(encoder_lstm_1_layer)
# The last encoder LSTM does not return sequences; only its states are used.
encoder_output, state_h, state_c = LSTM(embedding_dim, return_state=True, name='encoder_LSTM_Final')(encoder_lstm_2_layer)
# Kept to initialise the first decoder LSTM; encoder_output itself is unused.
encoder_states = [state_h, state_c] # encoder_output discarded

Decoder part

# Decoder graph: target token ids -> embedding -> three stacked LSTMs -> two
# Dense layers producing one distribution over target tokens per time step.
decoder_input_layer = Input(shape=(None,), name='decoder_Input')
# Maps each of the trgt_token_num target ids to an embedding_dim vector.
decoder_embedding_output = Embedding(trgt_token_num, embedding_dim, name='decoder_Embedding')(decoder_input_layer)
# Use encoder states to initialize decoder LSTM
decoder_lstm_1_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='decoder_LSTM_1')
decoder_lstm_1_layer_output = decoder_lstm_1_layer(decoder_embedding_output, encoder_states)
# NOTE(review): `decoder_lstm_1_layer_output` is the full return_state=True
# result (output plus states), and all of it is passed to the next LSTM;
# presumably the extra entries become that layer's initial state -- confirm
# this is intended. The same applies to `decoder_lstm_2_layer_output` below.
decoder_lstm_2_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='decoder_LSTM_2')
decoder_lstm_2_layer_output = decoder_lstm_2_layer(decoder_lstm_1_layer_output)
# State_h and state_c discarded.
decoder_lstm_fianl_layer = LSTM(embedding_dim, return_sequences=True, return_state=True, name='decoder_LSTM_Final')
decoder_lstm_output, _, _ = decoder_lstm_fianl_layer(decoder_lstm_2_layer_output)
# Classify words
decoder_dense_1_layer = Dense(embedding_dim, activation='relu', name='decoder_Dense_1_relu')
decoder_dense_1_output = decoder_dense_1_layer(decoder_lstm_output)
# The model is trained with categorical cross-entropy against one-hot targets,
# which expects a probability distribution over the target vocabulary, so the
# output layer must be a softmax rather than an unbounded linear projection.
decoder_dense_final_layer = Dense(trgt_token_num, activation='softmax', name='decoder_Dense_Final')
decoder_dense_output = decoder_dense_final_layer(decoder_dense_1_output)

Combine encoder & decoder

# Training model: takes the source sequence and the decoder input sequence,
# and outputs the per-time-step scores produced by the final decoder Dense layer.
encoder_decoder_model = Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=decoder_dense_output)

Compile config & Model summary

# Train with RMSprop on categorical cross-entropy and report accuracy ('acc').
encoder_decoder_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
# Print the layer-by-layer overview of the compiled model.
encoder_decoder_model.summary()

image

Plot Model structure

image

Training Data Generator

import numpy as np

def init_input_matrix(batch_size, max_src_sen_len, max_trgt_sen_len, trgt_token_num, dtype):
    """Allocate the zero-filled buffers for one training batch.

    Returns a 3-tuple of arrays, all of the given ``dtype``:

    1. encoder input, shape ``(batch_size, max_src_sen_len)``
    2. decoder input, shape ``(batch_size, max_trgt_sen_len)``
    3. decoder output (one slot per target token at each position),
       shape ``(batch_size, max_trgt_sen_len, trgt_token_num)``
    """
    encoder_in_shape = (batch_size, max_src_sen_len)
    decoder_in_shape = (batch_size, max_trgt_sen_len)
    # Same leading dimensions as the decoder input, plus the vocabulary axis.
    decoder_out_shape = decoder_in_shape + (trgt_token_num,)

    shapes = (encoder_in_shape, decoder_in_shape, decoder_out_shape)
    return tuple(np.zeros(shape, dtype=dtype) for shape in shapes)

def onehot_generator(src_corpus, trgt_corpus, max_src_sen_len, max_trgt_sen_len, src_index, trgt_index, batch_size=32, dtype='float32'):
    """Yield ``([encoder_input, decoder_input], decoder_output)`` batches forever.

    The two corpora are walked in lockstep, over and over. Every sentence
    pair fills one row of the current batch:

    * encoder input row  -- the source token ids (``src_index`` lookups),
    * decoder input row  -- the target token ids (``trgt_index`` lookups),
    * decoder output row -- the target sequence shifted one step to the
      left and one-hot encoded over ``len(trgt_index)`` classes.

    Fresh buffers are allocated at the start of every batch, so a yielded
    batch is never overwritten afterwards. A batch that is still incomplete
    when the corpus ends is carried over and completed with the first
    sentences of the next pass.

    Args:
        src_corpus: Re-iterable collection of source sentences (token sequences).
        trgt_corpus: Re-iterable collection of target sentences, aligned
            with ``src_corpus``.
        max_src_sen_len: Width of the encoder input buffer.
        max_trgt_sen_len: Width of the decoder input/output buffers.
        src_index: Mapping from source token to integer id.
        trgt_index: Mapping from target token to integer id.
        batch_size: Number of sentence pairs per yielded batch.
        dtype: dtype of all three buffers.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if a pass over
            the corpora produces no sentence pair at all. In both cases the
            original loop would spin forever without yielding anything.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    cnt, trgt_token_num = 0, len(trgt_index)
    while True:
        saw_sentence = False
        for src_sen, trgt_sen in zip(src_corpus, trgt_corpus):
            saw_sentence = True
            # data init
            if cnt == 0:
                encoder_input_data, decoder_input_data, decoder_output_data = init_input_matrix(
                    batch_size, max_src_sen_len, max_trgt_sen_len, trgt_token_num, dtype)
            # add entry
            for j, src_word in enumerate(src_sen):
                encoder_input_data[cnt, j] = src_index[src_word]
            for k, trgt_word in enumerate(trgt_sen):
                token_id = trgt_index[trgt_word]
                decoder_input_data[cnt, k] = token_id
                if k > 0:
                    # The expected output at step k-1 is the input token of step k.
                    decoder_output_data[cnt, k - 1, token_id] = 1  # One-hot here
            cnt += 1
            # return data to caller
            if cnt == batch_size:
                cnt = 0
                # x: encoder_input + decoder_input. y: decoder_output
                yield [encoder_input_data, decoder_input_data], decoder_output_data
        if not saw_sentence:
            # Empty corpora (or one-shot iterators that are already used up)
            # can never fill a batch; fail loudly instead of looping forever.
            raise ValueError("src_corpus/trgt_corpus produced no sentence pairs")

Training Config

# Number of sentence pairs per generated batch.
batch_size = 25
ed_gen_history = encoder_decoder_model.fit_generator(onehot_generator(en, cn, max_en_len, max_cn_len, en_index, cn_index, batch_size=batch_size),
                                                     # One epoch = as many batches as fit into the data set.
                                                     steps_per_epoch=data_num // batch_size,
                                                     # A keyword argument may only be passed once; the short
                                                     # debugging value is kept here for reference only.
                                                     # steps_per_epoch=2,
                                                     epochs=10)

Environment

sys: win 10
py: Anaconda3-5.1.0-Windows-x86_64 (py3.6)
keras: 2.1.5
tensorflow: 1.6.0 (backend)
CUDA: 9
cudnn: 7