takuseno / d3rlpy

An offline deep reinforcement learning library

Home Page: https://takuseno.github.io/d3rlpy


[BUG] Actor and critic loss are nan as well as predicted action.

stijnmeels opened this issue · comments

Dear @takuseno,

I have migrated a BCQ setup from d3rlpy 1.1.1 to 2.3.0. When I train the model (with a small number of steps just to make sure the code runs), the critic loss and actor loss come out as nan, while the imitator loss has a regular value. When I then save the model and make predictions, the predicted actions (continuous, in the range [0, 1]) are also nan. I have added the main part of my code below. Lastly, as you can see in the logs, there is a gap of around 10 minutes between epochs even though each epoch itself finishes within seconds.
import numpy as np
import pandas as pd
from tqdm import tqdm
from d3rlpy.algos import BCQConfig
from d3rlpy.dataset import MDPDataset, ReplayBuffer, InfiniteBuffer
from d3rlpy.metrics import TDErrorEvaluator, AverageValueEstimationEvaluator, InitialStateValueEstimationEvaluator
from sklearn.model_selection import train_test_split

def create_MDP_dataset(df, C=[4], test_size=0.2, random_state=42):
    # extract unique flight IDs
    unique_f = df[['FL', 'FLD', 'C']].drop_duplicates()
    
    # split the unique flights into training and test sets
    train_f, test_f = train_test_split(unique_f, test_size=test_size, random_state=random_state)
    
    # create MDPDataset objects for training and test sets
    train_dataset, train_f = create_MDP_dataset_from_df(df, train_f, C)
    test_dataset, test_f = create_MDP_dataset_from_df(df, test_f, C)
    return train_dataset, test_dataset, train_f, test_f

def create_MDP_dataset_from_df(df, f_df, C=[4]):
    f_list = []
    observations = []
    actions = []
    rewards = []
    terminals = []
    f_of_interest = df.merge(f_df, on=['FL', 'FLD', 'C'])
    grouped_f = f_of_interest.groupby(['FL', 'FLD', 'C'])
    for f_key, f_data in grouped_f:
        observations.append(np.column_stack([f_data['TF'].values, f_data['S'].values, f_data['LFG'].values]))
        actions.append(f_data['NB'].values)
        rewards.append(f_data['R'].values)
        terminals.append(np.append(np.zeros((f_data.shape[0] - 1,)), 1))
        f_list.append([f_key[0], f_key[1], f_key[2]])
    f = pd.DataFrame(f_list, columns=['FL', 'FLD', 'C'])
    observations = np.vstack(observations)
    actions = np.reshape(np.concatenate(actions), (-1, 1))
    rewards = np.concatenate(rewards)
    terminals = np.concatenate(terminals)
    dataset = MDPDataset(observations, actions, rewards, terminals)
    return dataset, f

train_dataset2019, test_dataset2019, train_f2019, test_f2019 = create_MDP_dataset(df2019)
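
# (Sketch of a sanity check I could add here, not part of my original run:
#  make sure no NaN/Inf values end up in the constructed datasets before training.)
for ep in train_dataset2019.episodes:
    assert np.all(np.isfinite(ep.observations)), 'non-finite observation'
    assert np.all(np.isfinite(ep.actions)), 'non-finite action'
    assert np.all(np.isfinite(ep.rewards)), 'non-finite reward'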

############################## TRAIN MODEL #####################################
with open('train_dataset 2019 v1.h5', 'w+b') as f:
    train_dataset2019.dump(f)
with open('test_dataset 2019 v1.h5', 'w+b') as f:
    test_dataset2019.dump(f)

with open('train_dataset 2019 v1.h5', 'rb') as f:
    train_dataset = ReplayBuffer.load(f, InfiniteBuffer())
with open('test_dataset 2019 v1.h5', 'rb') as f:
    test_dataset = ReplayBuffer.load(f, InfiniteBuffer())

bcq = BCQConfig().create(device=None)

bcq.build_with_dataset(train_dataset)


batch_size = 100
num_epochs = 100
train_episodes = train_dataset.episodes
total_samples = len(train_episodes) * len(train_episodes[0])
steps_per_epoch = total_samples // batch_size
total_steps = steps_per_epoch * num_epochs

bcq.fit(train_dataset,
        n_steps=total_steps,
        n_steps_per_epoch=steps_per_epoch,
        evaluators={
            'td_error': TDErrorEvaluator(episodes=test_dataset),
            'value_scale': AverageValueEstimationEvaluator(episodes=test_dataset),
            "init_value": InitialStateValueEstimationEvaluator(episodes=test_dataset),
        })
bcq.save_model('bcq_model_test_stijn_model.pt')
bcq.save_policy('bcq_policy_test_stijn_policy.pt')

2024-01-18 10:09.52 [info ] Parameters params={'observation_shape': [3], 'action_size': 1, 'config': {'type': 'bcq', 'params': {'batch_size': 100, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'actor_learning_rate': 0.001, 'critic_learning_rate': 0.001, 'imitator_learning_rate': 0.001, 'actor_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'critic_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'imitator_optim_factory': {'type': 'adam', 'params': {'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}}, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'imitator_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'tau': 0.005, 'n_critics': 2, 'update_actor_interval': 1, 'lam': 0.75, 'n_action_samples': 100, 'action_flexibility': 0.05, 'rl_start_step': 0, 'beta': 0.5}}}

Epoch 1/2: 100%|██████████| 100/100 [00:14<00:00, 6.81it/s, imitator_loss=0.326, critic_loss=nan, actor_loss=nan]

2024-01-18 10:21.29 [info ] BCQ_20240118100952: epoch=1 step=100 epoch=1 metrics={'time_sample_batch': 0.009686501026153564, 'time_algorithm_update': 0.13445483684539794, 'imitator_loss': 0.30401995055377484, 'critic_loss': nan, 'actor_loss': nan, 'time_step': 0.14449420928955078, 'td_error': nan, 'value_scale': nan, 'init_value': nan} step=100

2024-01-18 10:21.30 [info ] Model parameters are saved to d3rlpy_logs\BCQ_20240118100952\model_100.d3

Epoch 2/2: 100%|██████████| 100/100 [00:13<00:00, 7.62it/s, imitator_loss=0.0804, critic_loss=nan, actor_loss=nan]

2024-01-18 10:33.20 [info ] BCQ_20240118100952: epoch=2 step=200 epoch=2 metrics={'time_sample_batch': 0.011139976978302001, 'time_algorithm_update': 0.11810533285140991, 'imitator_loss': 0.0829407175630331, 'critic_loss': nan, 'actor_loss': nan, 'time_step': 0.129679434299469, 'td_error': nan, 'value_scale': nan, 'init_value': nan} step=200

2024-01-18 10:33.20 [info ] Model parameters are saved to d3rlpy_logs\BCQ_20240118100952\model_200.d3
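
The parameter dump above also shows that observation_scaler, action_scaler and reward_scaler are all set to 'none'. In case that is related to the nan losses, a variant I could try is configuring them explicitly, roughly like this (just a sketch; I am assuming these preprocessing classes are the right choice for my continuous [0, 1] actions):

from d3rlpy.preprocessing import StandardObservationScaler, MinMaxActionScaler, StandardRewardScaler

# explicit scalers instead of the default 'none' entries shown in the log above
bcq = BCQConfig(
    observation_scaler=StandardObservationScaler(),
    action_scaler=MinMaxActionScaler(),
    reward_scaler=StandardRewardScaler(),
).create(device=None)
bcq.build_with_dataset(train_dataset)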

######################### EVALUATE MODEL ################################
                        
with open('train_dataset 2019 v1.h5', 'rb') as f:
    train_dataset = ReplayBuffer.load(f, InfiniteBuffer())
with open('test_dataset 2019 v1.h5', 'rb') as f:
    test_dataset = ReplayBuffer.load(f, InfiniteBuffer())



train_f, test_f = train_f2019, test_f2019
test_f = test_f.reset_index(drop=True)

bcq1 = BCQConfig().create(device=None)
bcq1.build_with_dataset(train_dataset)
bcq1.load_model('bcq_model_test_stijn_model.pt')


start = 0  
env = BPEnv(df2019, test_f, idx_start=start)
nr_f_testing = len(test_f) 
validation_flights = pd.DataFrame(columns=list(df2019.columns) + ['Predicted bidprice'])
actions = []
for _ in tqdm(range(nr_f_testing)):
    done = False
    obs = env.reset()
    while not done:
        # re-batch the latest observation on every step before predicting
        obs_batch = np.expand_dims(obs, axis=0)
        action = bcq1.predict(obs_batch)[0]
        actions.append(action)
        obs, reward, done, info = env.step(action[0])
    env.calculate_predicted_BP()
    # env.render()
    validation_flights = pd.concat([validation_flights, env.flight]).reset_index(drop=True)

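To rule out my environment loop as the cause, a minimal check directly against the stored test data could look like this (a sketch; it just feeds the first observation of the first test episode to the loaded model):

obs_single = test_dataset.episodes[0].observations[:1]
pred_action = bcq1.predict(obs_single)
pred_value = bcq1.predict_value(obs_single, pred_action)
# I would expect a nan action here as well, matching what I see in the env loop
print(pred_action, pred_value)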

Hopefully you have an idea of what is causing this problem.

Kind regards,

Stijn Meels