Lightning-AI / pytorch-lightning

Pretrain, finetune and deploy AI models on multiple GPUs, TPUs with zero code changes.

Home Page:https://lightning.ai

Running `test` with LightningCLI, the program can quit before the test loop ends

t4rf9 opened this issue

Bug description

Within my LightningModule I call self.log_dict(metrics, on_step=True, on_epoch=True) in test_step, and I run the test with python main.py test --config config.yaml, where main.py contains only cli = LightningCLI() and config.yaml provides both the datamodule and the model. The TensorBoardLogger is used.

However, after the program ends, I sometimes find the epoch-level metrics epoch, test_accuracy_epoch and test_loss_epoch in the logger file as expected, but on most attempts these 3 metrics do not show up, while the step-level metrics are always written normally.

When the problem occurs, nothing abnormal appears in the command-line output. It looks as if the program exited normally.

A workaround I found is to sleep for a while in main.py right after cli = LightningCLI(). This suggests that a child thread is not joined before the process exits, so the epoch-level metrics never get flushed to disk.
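
For reference, a possible alternative to sleeping (just a sketch, not verified to fix the issue) is to finalize the logger explicitly after LightningCLI returns, which flushes and closes the TensorBoard writer:

from lightning.pytorch.cli import LightningCLI


def cli_main():
    cli = LightningCLI()
    # Explicitly flush and close the TensorBoard writer so that epoch-level
    # metrics are persisted even if a background thread has not finished yet.
    if cli.trainer.logger is not None:
        cli.trainer.logger.finalize("success")


if __name__ == "__main__":
    cli_main()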

What version are you seeing the problem on?

v2.2

How to reproduce the bug

main.py

from lightning.pytorch.cli import LightningCLI
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import ModelCheckpoint

from model import Model
from datamodule import DataModule


def cli_main():
    cli = LightningCLI()


if __name__ == "__main__":
    cli_main()

    from time import sleep
    sleep(2)
    # The problem disappears when this sleep is added.
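
The script is run as described above:

python main.py test --config config.yaml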

config.yaml

# lightning.pytorch==2.2.5
ckpt_path: null
seed_everything: 0
model:
  class_path: model.Model
  init_args:
    learning_rate: 1e-3
data:
  class_path: datamodule.DataModule
  init_args:
    data_dir: data
trainer:
  accelerator: gpu
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  fast_dev_run: false
  max_epochs: 100
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: 10
  limit_test_batches: null
  limit_predict_batches: null
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: lightning_logs/resnet50
      name: normalized
  callbacks:
    class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      save_top_k: 5
      monitor: valid_loss
      filename: "{epoch}-{step}-{valid_loss:.8f}"
  overfit_batches: 0.0
  val_check_interval: 50
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: 50
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: false
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: true
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null

model.py

import torch
from torch import nn
import torch.nn.functional as F
import lightning as pl
from torchvision.models import resnet50


class Model(pl.LightningModule):
    def __init__(self, learning_rate: float):
        super().__init__()

        self.save_hyperparameters()

        CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        class_num = len(CHARS)
        self.text_len = 4

        resnet = resnet50()
        resnet.conv1 = nn.Conv2d(
            1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
        )
        layers = list(resnet.children())
        self.resnet = nn.Sequential(*layers[:9])
        self.linear = nn.Linear(512, class_num)
        self.softmax = nn.Softmax(2)

    def _calc_softmax(self, x: torch.Tensor) -> torch.Tensor:
        x = self.resnet(x)  # (batch, 2048, 1, 1)
        x = x.reshape(x.shape[0], self.text_len, -1)  # (batch, 4, 512)
        x = self.linear(x)  # (batch, 4, 62)
        x = self.softmax(x)  # (batch, 4, 62)
        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # in lightning, forward defines the prediction/inference actions
        x = self._calc_softmax(x)  # (batch, 4, 62)
        return torch.argmax(x, 2)  # (batch, 4)

    def training_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
        # training_step defines the training loop.
        # It is independent of forward.
        img, target = batch
        batch_size = img.shape[0]

        pred_softmax = self._calc_softmax(img)  # (batch, 4, 62)
        pred_softmax_permute = pred_softmax.permute((0, 2, 1))  # (batch, 62, 4)

        loss = F.cross_entropy(pred_softmax_permute, target)

        with torch.no_grad():
            pred = torch.argmax(pred_softmax, 2)  # (batch, 4)
            char_correct = (pred == target).sum(1)  # (batch)
            batch_correct = (char_correct == self.text_len).sum()
            batch_accuracy = batch_correct / batch_size

        metrics = {"train_accuracy": batch_accuracy, "train_loss": loss}
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        return loss

    def configure_optimizers(self) -> torch.optim.Optimizer:
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        return optimizer

    def validation_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
        # validation_step defines the validation loop.
        # It is independent of forward.
        img, target = batch
        batch_size = img.shape[0]

        pred_softmax = self._calc_softmax(img)  # (batch, 4, 62)
        pred_softmax_permute = pred_softmax.permute((0, 2, 1))  # (batch, 62, 4)

        loss = F.cross_entropy(pred_softmax_permute, target)

        with torch.no_grad():
            pred = torch.argmax(pred_softmax, 2)  # (batch, 4)
            char_correct = (pred == target).sum(1)  # (batch)
            batch_correct = (char_correct == self.text_len).sum()
            batch_accuracy = batch_correct / batch_size

        metrics = {"valid_accurary": batch_accuracy, "valid_loss": loss}
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        return loss

    def test_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
        # test_step defines the test loop.
        # It is independent of forward.
        img, target = batch
        batch_size = img.shape[0]

        pred_softmax = self._calc_softmax(img)  # (batch, 4, 62)
        pred_softmax_permute = pred_softmax.permute((0, 2, 1))  # (batch, 62, 4)

        loss = F.cross_entropy(pred_softmax_permute, target)

        with torch.no_grad():
            pred = torch.argmax(pred_softmax, 2)  # (batch, 4)
            char_correct = (pred == target).sum(1)  # (batch)
            batch_correct = (char_correct == self.text_len).sum()
            batch_accuracy = batch_correct / batch_size

        metrics = {"test_accurary": batch_accuracy, "test_loss": loss}
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        ## The `on_epoch` part of behaviors are unstable, but `test_accuracy_step` can always be seen.
        ## If `on_step=False` and `on_epoch=True`, it works fine to me.


        return loss
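
For completeness, the epoch-only logging variant that works fine for me (as noted in the comment above) is:

        metrics = {"test_accuracy": batch_accuracy, "test_loss": loss}
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=False, on_epoch=True)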


### Error messages and logs

No error messages or abnormal logs are produced; the program appears to exit normally.



### Environment

<details>
  <summary>Current environment</summary>

  • PyTorch Lightning Version: 2.2.5
  • PyTorch Version: 2.3.1+cu121
  • Python version: 3.12.4
  • OS: Windows 11
  • CUDA/cuDNN version: 12.1
  • GPU models and configuration: GTX 1650
  • How you installed Lightning: pip

</details>


### More info

_No response_