pytorch/kineto

A CPU+GPU profiling library that provides access to timeline traces and hardware performance counters.

Too much time spent in 'Other' category in execution graph

shradhasehgal opened this issue

Hi, could you clarify what the 'Other' category covers in the execution time graph? The script below spends nearly 60% of its time in the 'Other' category, even though it only performs PyTorch data loading and model training.

I read the docs, and the existing categories (kernel, memcpy, communication, runtime, dataloader, CPU exec) seem to provide good coverage, so could you give some examples of the kinds of things that end up in the 'Other' category?

I am trying to understand what in the script below causes it to spend the majority of its time in 'Other' operations.
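
To narrow this down, I can also print an operator-level summary once the profiling run finishes. This is just a minimal sketch that reuses the `p` profiler object from the script further below; with a schedule in place, the table presumably reflects only the steps that were actually profiled.

# Minimal sketch: operator self times as a cross-check against the
# TensorBoard category chart. Run after the `with profile(...)` block exits.
print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=20))

The full reproduction script: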

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.profiler import ProfilerActivity, profile, schedule

device = "cuda" if torch.cuda.is_available() else "cpu"
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
training_data = datasets.FashionMNIST(
        root='./data',
        train=True,
        download=True,
        transform=ToTensor())

dataloader = DataLoader(
        training_data,
        batch_size=32)

model: nn.Module = NeuralNetwork()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model.to(device)
model.train()


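# Profile one cycle: skip the first step, then wait 1 step, warm up for 1
# step, and record 3 active steps; traces are written to /tmp/logs for the
# TensorBoard profiler plugin.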
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(skip_first=1, wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('/tmp/logs'),
    record_shapes=True,
    with_stack=True,
    profile_memory=True) as p:

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        output = model(X)
        loss = loss_fn(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        p.step()
(Screenshot, 2023-07-05: TensorBoard profiler overview showing the step time breakdown, with the 'Other' category accounting for the majority of the step time.)
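
In case it helps, here is a rough sketch of how the raw event durations can be aggregated per category straight from the exported Chrome trace. The file name pattern and the 'cat'/'dur' fields are assumptions about what the trace handler writes under /tmp/logs, and CPU and GPU events overlap in time, so these sums will not match TensorBoard's wall-clock breakdown exactly.

import glob
import json
from collections import Counter

# Minimal sketch: sum event durations per raw trace category. Assumes the
# tensorboard_trace_handler above wrote a *.pt.trace.json file to /tmp/logs;
# metadata events have no 'dur' field and contribute 0.
trace_file = sorted(glob.glob('/tmp/logs/*.pt.trace.json'))[-1]
with open(trace_file) as f:
    events = json.load(f)['traceEvents']

per_category = Counter()
for ev in events:
    per_category[ev.get('cat', 'unknown')] += ev.get('dur', 0)  # microseconds

for cat, us in per_category.most_common():
    print(f'{cat}: {us / 1e3:.1f} ms')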