pytorch / kineto

A CPU+GPU Profiling library that provides access to timeline traces and hardware performance counters.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Call Stack not being recorded for torch_tb_profiler

jeromeku opened this issue · comments

Running the example does not record stack traces even though the `with_stack` profiler option is set to 1.

Here is my env:

tensorboard                  2.13.0
torch                        2.0.1
torch-tb-profiler            0.4.1 

Here is an excerpt from the tensorboard trace:

{
  "schemaVersion": 1,
  "deviceProperties": [
    {
      "id": 0, "name": "NVIDIA RTX A6000", "totalGlobalMem": 51041271808,
      "computeMajor": 8, "computeMinor": 6,
      "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536,
      "regsPerBlock": 65536, "regsPerMultiprocessor": 65536, "warpSize": 32,
      "sharedMemPerBlock": 49152, "sharedMemPerMultiprocessor": 102400,
      "numSms": 84, "sharedMemPerBlockOptin": 101376
    }
  ],
  "record_shapes": 1,
  "with_stack": 1,
  "profile_memory": 1,
  "traceEvents": [
  {
    "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: NllLossBackward0", "pid": 18462, "tid": 18549,
    "ts": 1688618105515688, "dur": 176,
    "args": {
      "External id": 1025,"Ev Idx": 0, "Fwd thread id": 1, "Sequence number": 533
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "NllLossBackward0", "pid": 18462, "tid": 18549,
    "ts": 1688618105515717, "dur": 136,
    "args": {
      "External id": 1026,"Ev Idx": 1, "Input Dims": [[]], "Input type": ["float"], "Fwd thread id": 1, "Sequence number": 533
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::nll_loss_backward", "pid": 18462, "tid": 18549,
    "ts": 1688618105515737, "dur": 115,
    "args": {
      "External id": 1027,"Ev Idx": 2, "Input Dims": [[], [32, 1000], [32], [], [], [], []], "Input type": ["float", "float", "long int", "", "Scalar", "Scalar", "float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 18462, "tid": 18549,
    "ts": 1688618105515766, "dur": 70,
    "args": {
      "External id": 1028,"Ev Idx": 3, "Input Dims": [[32, 1000]], "Input type": ["float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 18462, "tid": 18549,
    "ts": 1688618105515771, "dur": 62,
    "args": {
      "External id": 1029,"Ev Idx": 4, "Input Dims": [[32, 1000], []], "Input type": ["float", "Scalar"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: LogSoftmaxBackward0", "pid": 18462, "tid": 18549,
    "ts": 1688618105515875, "dur": 48,
    "args": {
      "External id": 1030,"Ev Idx": 5, "Fwd thread id": 1, "Sequence number": 532
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "LogSoftmaxBackward0", "pid": 18462, "tid": 18549,
    "ts": 1688618105515877, "dur": 40,
    "args": {
      "External id": 1031,"Ev Idx": 6, "Input Dims": [[32, 1000]], "Input type": ["float"], "Fwd thread id": 1, "Sequence number": 532
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::_log_softmax_backward_data", "pid": 18462, "tid": 18549,
    "ts": 1688618105515885, "dur": 31,
    "args": {
      "External id": 1032,"Ev Idx": 7, "Input Dims": [[32, 1000], [32, 1000], [], []], "Input type": ["float", "float", "Scalar", "Scalar"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddmmBackward0", "pid": 18462, "tid": 18549,
    "ts": 1688618105515930, "dur": 247,
    "args": {
      "External id": 1033,"Ev Idx": 8, "Fwd thread id": 1, "Sequence number": 531
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "AddmmBackward0", "pid": 18462, "tid": 18549,
    "ts": 1688618105515932, "dur": 177,
    "args": {
      "External id": 1034,"Ev Idx": 9, "Input Dims": [[32, 1000]], "Input type": ["float"], "Fwd thread id": 1, "Sequence number": 531
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 18462, "tid": 18549,
    "ts": 1688618105515937, "dur": 18,
    "args": {
      "External id": 1035,"Ev Idx": 10, "Input Dims": [[2048, 1000]], "Input type": ["float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 18462, "tid": 18549,
    "ts": 1688618105515942, "dur": 11,
    "args": {
      "External id": 1036,"Ev Idx": 11, "Input Dims": [[2048, 1000], [], []], "Input type": ["float", "Scalar", "Scalar"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 18462, "tid": 18549,
    "ts": 1688618105515950, "dur": 2,
    "args": {
      "External id": 1037,"Ev Idx": 12, "Input Dims": [[2048, 1000], [], [], []], "Input type": ["float", "", "", ""]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 18462, "tid": 18549,
    "ts": 1688618105515957, "dur": 90,
    "args": {
      "External id": 1038,"Ev Idx": 13, "Input Dims": [[32, 1000], [1000, 2048]], "Input type": ["float", "float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 18462, "tid": 18549,
    "ts": 1688618105516050, "dur": 7,
    "args": {
      "External id": 1039,"Ev Idx": 14, "Input Dims": [[32, 1000]], "Input type": ["float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 18462, "tid": 18549,
    "ts": 1688618105516052, "dur": 4,
    "args": {
      "External id": 1040,"Ev Idx": 15, "Input Dims": [[32, 1000], [], []], "Input type": ["float", "Scalar", "Scalar"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 18462, "tid": 18549,
    "ts": 1688618105516054, "dur": 2,
    "args": {
      "External id": 1041,"Ev Idx": 16, "Input Dims": [[32, 1000], [], [], []], "Input type": ["float", "", "", ""]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 18462, "tid": 18549,
    "ts": 1688618105516059, "dur": 43,
    "args": {
      "External id": 1042,"Ev Idx": 17, "Input Dims": [[1000, 32], [32, 2048]], "Input type": ["float", "float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 18462, "tid": 18549,
    "ts": 1688618105516104, "dur": 4,
    "args": {
      "External id": 1043,"Ev Idx": 18, "Input Dims": [[1000, 2048]], "Input type": ["float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 18462, "tid": 18549,
    "ts": 1688618105516105, "dur": 2,
    "args": {
      "External id": 1044,"Ev Idx": 19, "Input Dims": [[1000, 2048], [], []], "Input type": ["float", "Scalar", "Scalar"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 18462, "tid": 18549,
    "ts": 1688618105516106, "dur": 1,
    "args": {
      "External id": 1045,"Ev Idx": 20, "Input Dims": [[1000, 2048], [], [], []], "Input type": ["float", "", "", ""]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 18462, "tid": 18549,
    "ts": 1688618105516113, "dur": 41,
    "args": {
      "External id": 1046,"Ev Idx": 21, "Input Dims": [[32, 1000], [], [], []], "Input type": ["float", "", "Scalar", ""]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 18462, "tid": 18549,
    "ts": 1688618105516156, "dur": 4,
    "args": {
      "External id": 1047,"Ev Idx": 22, "Input Dims": [[1, 1000], []], "Input type": ["float", ""]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 18462, "tid": 18549,
    "ts": 1688618105516192, "dur": 18,
    "args": {
      "External id": 1048,"Ev Idx": 23
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 18462, "tid": 18549,
    "ts": 1688618105516195, "dur": 11,
    "args": {
      "External id": 1049,"Ev Idx": 24, "Input Dims": [[1000]], "Input type": ["float"]
    }
  },
  {
    "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 18462, "tid": 18549,
    "ts": 1688618105516200, "dur": 6,
    "args": {
      "External id": 1050,"Ev Idx": 25, "Input Dims": [[1000]], "Input type": ["float"]
    }
  },

When I compare this trace to the traces in the samples folder, the sample traces have a call stack entry for each operator, but the trace above has no such entries.

Viewing the Operator tab in tensorboard also shows that the "View Call Stack" links are all greyed out.