[BUG] Actor and critic loss are nan as well as predicted action.
stijnmeels opened this issue · comments
Dear @takuseno,
I have migrated my BCQ code from d3rlpy 1.1.1 to 2.3.0. However, when I train the model (with a small number of steps, just to make sure the code runs), the critic loss and actor loss are nan, while the imitator loss has a regular value. Also, when I save this model and make predictions, the predicted action values (continuous, in the range [0, 1]) are nan. I have added the main part of my code below. Lastly, the time between epochs is around 10 minutes (as you can see in the log output below).
import numpy as np import pandas as pd import tqdm from d3rlpy.algos import BCQConfig from d3rlpy.dataset import MDPDataset, ReplayBuffer, InfiniteBuffer from d3rlpy.metrics import TDErrorEvaluator, AverageValueEstimationEvaluator, InitialStateValueEstimationEvaluator from sklearn.model_selection import
train_test_split`
def create_MDP_dataset(df, C=[4], test_size=0.2, random_state=42):
    """Split the flights found in ``df`` and build a train and a test MDPDataset.

    A flight is identified by its (FL, FLD, C) combination. The distinct
    flights are divided with ``train_test_split`` and each part is handed to
    ``create_MDP_dataset_from_df`` together with the full ``df``.

    Args:
        df: DataFrame holding the FL, FLD and C columns plus whatever
            ``create_MDP_dataset_from_df`` reads.
        C: Forwarded unchanged to ``create_MDP_dataset_from_df``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train_dataset, test_dataset, train_f, test_f)`` where the
        last two are the flight frames returned by
        ``create_MDP_dataset_from_df``.
    """
    flight_keys = ['FL', 'FLD', 'C']

    # One row per distinct flight, so the split is made on flights rather
    # than on individual rows.
    distinct_flights = df[flight_keys].drop_duplicates()
    train_keys, test_keys = train_test_split(
        distinct_flights,
        test_size=test_size,
        random_state=random_state,
    )

    # Build the training split first, then the test split.
    train_dataset, train_f = create_MDP_dataset_from_df(df, train_keys, C)
    test_dataset, test_f = create_MDP_dataset_from_df(df, test_keys, C)
    return train_dataset, test_dataset, train_f, test_f
def _count_non_finite(values):
    """Return how many entries of ``values`` are NaN or infinite (0 if not numeric)."""
    try:
        as_float = np.asarray(values, dtype=np.float64)
    except (TypeError, ValueError):
        # Non-numeric data cannot be checked here; leave it to MDPDataset.
        return 0
    return int(np.count_nonzero(~np.isfinite(as_float)))


def create_MDP_dataset_from_df(df, f_df, C=[4]):
    """Build an ``MDPDataset`` from the rows of ``df`` that belong to the flights in ``f_df``.

    ``df`` is inner-joined with ``f_df`` on (FL, FLD, C) and grouped by the
    same columns. Each group contributes its rows in order, with the terminal
    flag set to 1 on the group's last row and 0 elsewhere.

    Args:
        df: DataFrame with the columns FL, FLD, C, TF, S, LFG, NB and R.
            TF, S and LFG form the observation, NB is the action and R the
            reward.
        f_df: DataFrame with FL, FLD and C columns selecting the flights to keep.
        C: Not used by this function; kept so existing callers keep working.

    Returns:
        Tuple ``(dataset, f)``: the ``MDPDataset`` and a DataFrame with one
        (FL, FLD, C) row per group, in the order the groups were added.

    Raises:
        ValueError: If no row of ``df`` matches a flight in ``f_df``.
    """
    import warnings

    key_columns = ['FL', 'FLD', 'C']
    f_list = []
    observations = []
    actions = []
    rewards = []
    terminals = []

    f_of_interest = df.merge(f_df, on=key_columns)
    # groupby yields (key, frame) pairs. The key and the frame need distinct
    # names, otherwise the frame overwrites the key and the flight id is lost.
    for f_key, f_data in f_of_interest.groupby(key_columns):
        observations.append(
            np.column_stack([f_data['TF'].values, f_data['S'].values, f_data['LFG'].values])
        )
        actions.append(f_data['NB'].values)
        rewards.append(f_data['R'].values)
        # Only the last row of a group ends the episode.
        terminals.append(np.append(np.zeros((f_data.shape[0] - 1,)), 1))
        f_list.append([f_key[0], f_key[1], f_key[2]])

    if not f_list:
        # np.vstack on an empty list fails with a far less helpful message.
        raise ValueError("no rows of df match the flights in f_df; cannot build an MDPDataset")

    f = pd.DataFrame(f_list, columns=key_columns)
    observations = np.vstack(observations)
    actions = np.reshape(np.concatenate(actions), (-1, 1))
    rewards = np.concatenate(rewards)
    terminals = np.concatenate(terminals)

    # NaN/inf in the data is a common source of nan losses during training,
    # so flag it here instead of letting it surface much later.
    for label, values in (('observations', observations), ('actions', actions), ('rewards', rewards)):
        bad = _count_non_finite(values)
        if bad:
            warnings.warn(
                f"{label} contain {bad} NaN or infinite value(s); "
                "training losses are likely to become nan",
                RuntimeWarning,
                stacklevel=2,
            )

    dataset = MDPDataset(observations, actions, rewards, terminals)
    return dataset, f
train_dataset2019, test_dataset2019, train_f2019, test_f2019 = create_MDP_dataset(df2019)

############################## TRAIN MODEL #####################################
# Persist both splits so the evaluation step can reload exactly the same data.
with open('train_dataset 2019 v1.h5', 'w+b') as f:
    train_dataset2019.dump(f)
with open('test_dataset 2019 v1.h5', 'w+b') as f:
    test_dataset2019.dump(f)

with open('train_dataset 2019 v1.h5', 'rb') as f:
    train_dataset = ReplayBuffer.load(f, InfiniteBuffer())
with open('test_dataset 2019 v1.h5', 'rb') as f:
    test_dataset = ReplayBuffer.load(f, InfiniteBuffer())

batch_size = 100
num_epochs = 100

# Pass batch_size to the config so the step computation below and the
# batches actually sampled by the algorithm cannot drift apart.
bcq = BCQConfig(batch_size=batch_size).create(device=None)
bcq.build_with_dataset(train_dataset)

train_episodes = train_dataset.episodes
# Episodes may differ in length, so add up every episode instead of
# multiplying the episode count by the length of the first one.
total_samples = sum(len(episode) for episode in train_episodes)
# Keep at least one step per epoch when the dataset is smaller than a batch.
steps_per_epoch = max(1, total_samples // batch_size)
total_steps = steps_per_epoch * num_epochs

# The evaluators take a sequence of episodes, not the buffer itself.
# NOTE(review): the evaluators run after every epoch; they are presumably
# where the long gap between epochs in the log comes from - confirm by
# timing a run without them.
test_episodes = test_dataset.episodes
bcq.fit(
    train_dataset,
    n_steps=total_steps,
    n_steps_per_epoch=steps_per_epoch,
    evaluators={
        'td_error': TDErrorEvaluator(episodes=test_episodes),
        'value_scale': AverageValueEstimationEvaluator(episodes=test_episodes),
        "init_value": InitialStateValueEstimationEvaluator(episodes=test_episodes),
    },
)

bcq.save_model('bcq_model_test_stijn_model.pt')
bcq.save_policy('bcq_policy_test_stijn_policy.pt')
2024-01-18 10:09.52 [info ] Parameters params={'observation_shape': [3], 'action_size': 1, 'config': {'type': 'bcq', 'params': {'batch_size': 100, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'actor_learning_rate': 0.001, 'critic_learning_rate': 0.001, 'imitator_learning_rate': 0.001, 'actor_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'critic_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'imitator_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'imitator_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'tau': 0.005, 'n_critics': 2, 'update_actor_interval': 1, 'lam': 0.75, 'n_action_samples': 100, 'action_flexibility': 0.05, 'rl_start_step': 0, 'beta': 0.5}}}
Epoch 1/2: 100%|██████████| 100/100 [00:14<00:00, 6.81it/s, imitator_loss=0.326, critic_loss=nan, actor_loss=nan]
2024-01-18 10:21.29 [info ] BCQ_20240118100952: epoch=1 step=100 epoch=1 metrics={'time_sample_batch': 0.009686501026153564, 'time_algorithm_update': 0.13445483684539794, 'imitator_loss': 0.30401995055377484, 'critic_loss': nan, 'actor_loss': nan, 'time_step': 0.14449420928955078, 'td_error': nan, 'value_scale': nan, 'init_value': nan} step=100
2024-01-18 10:21.30 [info ] Model parameters are saved to d3rlpy_logs\BCQ_20240118100952\model_100.d3
Epoch 2/2: 100%|██████████| 100/100 [00:13<00:00, 7.62it/s, imitator_loss=0.0804, critic_loss=nan, actor_loss=nan]
2024-01-18 10:33.20 [info ] BCQ_20240118100952: epoch=2 step=200 epoch=2 metrics={'time_sample_batch': 0.011139976978302001, 'time_algorithm_update': 0.11810533285140991, 'imitator_loss': 0.0829407175630331, 'critic_loss': nan, 'actor_loss': nan, 'time_step': 0.129679434299469, 'td_error': nan, 'value_scale': nan, 'init_value': nan} step=200
2024-01-18 10:33.20 [info ] Model parameters are saved to d3rlpy_logs\BCQ_20240118100952\model_200.d3
######################### EVALUATE MODEL ################################
# Reload the persisted splits so evaluation sees the same data as training.
with open('train_dataset 2019 v1.h5', 'rb') as f:
    train_dataset = ReplayBuffer.load(f, InfiniteBuffer())
with open('test_dataset 2019 v1.h5', 'rb') as f:
    test_dataset = ReplayBuffer.load(f, InfiniteBuffer())

train_f, test_f = train_f2019, test_f2019
test_f = test_f.reset_index(drop=True)

# Build an untrained BCQ with matching shapes, then load the saved weights.
bcq1 = BCQConfig().create(device=None)
bcq1.build_with_dataset(train_dataset)
bcq1.load_model('bcq_model_test_stijn_model.pt')

start = 0
env = BPEnv(df2019, test_f, idx_start=start)
nr_f_testing = len(test_f)

# Empty frame that fixes the output columns; results are accumulated in
# validation_flights, which has to exist before the first concat.
validation_f = pd.DataFrame(columns=list(df2019.columns) + ['Predicted bidprice'])
validation_flights = validation_f
actions = []

# tqdm is imported as a module, so the progress bar is tqdm.tqdm.
for _ in tqdm.tqdm(range(nr_f_testing)):
    done = False
    obs = env.reset()
    while not done:
        # Rebuild the batch from the latest observation on every step;
        # otherwise the policy keeps predicting for the initial state.
        obs_batch = np.expand_dims(obs, axis=0)
        action = bcq1.predict(obs_batch)[0]
        actions.append(action)
        obs, reward, done, info = env.step(action[0])
    # NOTE(review): the pasted code lost its indentation; these per-flight
    # steps are assumed to run once after each episode - confirm.
    env.calculate_predicted_BP()
    # env.render()
    validation_flights = pd.concat([validation_flights, env.flight]).reset_index(drop=True)
`
Hopefully you have a clear response for the problem.
Kind regards,
Stijn Meels