NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")

Geek Repo

Github PK Tool

Benchmark failure in lit-gpt falcon-7b model

xwang233 opened this issue · comments

Steps to reproduce:

pjnl-20240516, H100

root@7a32d69c1587:/opt/pytorch/lightning-thunder# NVFUSER_DISABLE=parallel_compile torchrun --nproc_per_node 2 --nnodes 1 thunder/benchmarks/benchmark_litgpt.py --model_name falcon-7b     --distributed_mode ddp     --compile thunder_cudnn

(part of the) stack trace:

An error occurred while executing nvFuser FusionDefinition 7.
If you believe this is a bug or need assistance, please file an issue at https://github.com/NVIDIA/Fuser/issues/new
Here's a script to reproduce the error:

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id7(fd : FusionDefinition) -> None :
    """Auto-generated nvFuser repro: builds the fusion that triggered the failure.

    Defines 11 input tensors (several with broadcast / zero-size dimensions — see
    the `inputs` list in the driver script), applies a GELU-like computation on
    T10 and two rotary-embedding-style cat/mul/add chains, and registers three
    outputs. NOTE(review): this is generated code from the bug report; the exact
    op sequence and tensor metadata are what reproduce the failure, so nothing
    here should be reordered or simplified.
    """
    # Inputs: T0/T1 are fp32 with broadcast dims; T2-T9 are bf16 views with
    # unusual stride_orders (e.g. [2, 1, 3, 0]) and non-contiguous dims;
    # T10 is a plain contiguous bf16 3-D tensor.
    T0 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 1, 0])
    T1 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 1, 0])
    T2 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T3 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T4 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T5 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T6 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T7 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T8 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[False, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 3, 0])
    T9 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, None, False, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T10 = fd.define_tensor(shape=[1, -1, -1], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    # Exact erf-based GELU on T10: 0.5 * x * (1 + erf(x / sqrt(2))).
    # S12 = sqrt(2); multiplication by its reciprocal implements x / sqrt(2).
    T11 = fd.ops.cast(T10, dtype=DataType.Float)
    S12 = fd.define_scalar(1.41421, dtype=DataType.Double)
    S13 = fd.ops.reciprocal(S12)
    T14 = fd.ops.mul(T11, S13)
    T15 = fd.ops.erf(T14)
    S16 = fd.define_scalar(0.500000, dtype=DataType.Double)
    T17 = fd.ops.mul(S16, T15)
    S18 = fd.define_scalar(0.500000, dtype=DataType.Double)
    T19 = fd.ops.add(S18, T17)
    T20 = fd.ops.mul(T11, T19)
    T21 = fd.ops.cast(T20, dtype=DataType.BFloat16)
    # First cat/mul/add chain: rotate-half style — negate T4, cat with T3,
    # then combine with the fp32 tensors T0/T1 (presumably cos/sin tables
    # from rotary embedding — inferred from shape/usage, not shown here).
    T22 = fd.ops.cast(T4, dtype=DataType.Float)
    T23 = fd.ops.neg(T22)
    T24 = fd.ops.cast(T23, dtype=DataType.BFloat16)
    T25 = fd.ops.cat([T24, T3], dim=-1)
    T26 = fd.ops.cast(T2, dtype=DataType.Float)
    T27 = fd.ops.mul(T26, T0)
    T28 = fd.ops.cast(T25, dtype=DataType.Float)
    T29 = fd.ops.mul(T28, T1)
    T30 = fd.ops.add(T27, T29)
    T31 = fd.ops.cast(T30, dtype=DataType.BFloat16)
    # Second chain: same pattern applied to T7/T6/T5.
    T32 = fd.ops.cast(T7, dtype=DataType.Float)
    T33 = fd.ops.neg(T32)
    T34 = fd.ops.cast(T33, dtype=DataType.BFloat16)
    T35 = fd.ops.cat([T34, T6], dim=-1)
    T36 = fd.ops.cast(T5, dtype=DataType.Float)
    T37 = fd.ops.mul(T36, T0)
    T38 = fd.ops.cast(T35, dtype=DataType.Float)
    T39 = fd.ops.mul(T38, T1)
    T40 = fd.ops.add(T37, T39)
    T41 = fd.ops.cast(T40, dtype=DataType.BFloat16)
    # Final cats append T8/T9 — both are zero-element tensors in the repro
    # inputs (last dimension is 0), which may be relevant to the failure.
    T42 = fd.ops.cat([T31, T8], dim=-1)
    T43 = fd.ops.cat([T41, T9], dim=-1)
    fd.add_output(T42)
    fd.add_output(T43)
    fd.add_output(T21)

# Build the fusion definition, then execute it with the exact tensor
# metadata captured from the failing run (device cuda:1, i.e. rank 1 of
# the 2-GPU DDP job).
with FusionDefinition() as fd:
    nvfuser_fusion_id7(fd)

# Inputs use as_strided to reproduce the original views exactly. Note the
# unusual metadata: several tensors have zero strides (broadcast views,
# e.g. inputs 5-7 with strides starting (0, 0, ...)) and inputs 8-9 have a
# zero-sized last dimension (numel == 0). The underlying storage sizes
# (first randn argument) match the captured allocations, not the view sizes.
inputs = [
    torch.randn((131072,), dtype=torch.float32, device='cuda:1').as_strided((1, 71, 2048, 64), (131072, 0, 64, 1)),
    torch.randn((131072,), dtype=torch.float32, device='cuda:1').as_strided((1, 71, 2048, 64), (131072, 0, 64, 1)),
    torch.randn((9568128,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 64), (4544, 64, 4672, 1)),
    torch.randn((9568096,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (4544, 64, 4672, 1)),
    torch.randn((9568096,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (4544, 64, 4672, 1)),
    torch.randn((9563648,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 64), (0, 0, 4672, 1)),
    torch.randn((9563616,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (0, 0, 4672, 1)),
    torch.randn((9563616,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 32), (0, 0, 4672, 1)),
    torch.randn((0,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 0), (4544, 64, 4672, 1)),
    torch.randn((0,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 71, 2048, 0), (0, 0, 4672, 1)),
    torch.randn((37224448,), dtype=torch.bfloat16, device='cuda:1').as_strided((1, 2048, 18176), (37224448, 18176, 1)),
]
# Running the fusion is where the CUDA_ERROR_ILLEGAL_ADDRESS was reported
# (during kernel compilation, per the stack trace below).
fd.execute(inputs)


Traceback (most recent call last):
  File "/opt/pytorch/nvfuser/nvfuser/__init__.py", line 139, in execute
    result = self._execute(
RuntimeError: _result == CUDA_SUCCESS INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/executor_utils.cpp":888, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS failed with error an illegal memory access was encountered
Exception raised from invoke at /opt/pytorch/nvfuser/csrc/executor_utils.cpp:888 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xf3 (0x7fb5375c7555 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x53 (0x7fb5378c5823 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x455e42 (0x7fb5378fae42 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x4580e6 (0x7fb5378fd0e6 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #4: nvfuser::FusionExecutor::compileFusion(nvfuser::Fusion*, nvfuser::KernelArgumentHolder const&, nvfuser::LaunchParams const&, nvfuser::CompileParams, nvfuser::ScheduleHeuristic, long, long, long, long) + 0x14a9 (0x7fb5378ddec9 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x5df922 (0x7fb537a84922 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #6: nvfuser::FusionKernelRuntime::compileFusionParallel(nvfuser::KernelArgumentHolder) + 0x447 (0x7fb537a8c307 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #7: nvfuser::FusionExecutorCache::runFusionWithInputs(c10::ArrayRef<c10::IValue> const&, std::optional<nvfuser::PrimDataType>, std::optional<signed char>) + 0xad3 (0x7fb537a97f23 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #8: nvfuser::python_frontend::FusionDefinition::execute(c10::ArrayRef<c10::IValue> const&, bool, bool, std::optional<signed char>) const + 0x3c8 (0x7fb537c76488 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x18f7f5 (0x7fb5376347f5 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x203812 (0x7fb5376a8812 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x28b870 (0x7fb537730870 in /opt/pytorch/nvfuser/nvfuser/_C.cpython-310-x86_64-linux-gnu.so)
frame #12: <unknown function> + 0x15a10e (0x55f83485510e in /usr/bin/python)
frame #13: _PyObject_MakeTpCall + 0x25b (0x55f83484ba7b in /usr/bin/python)
frame #14: <unknown function> + 0x168acb (0x55f834863acb in /usr/bin/python)
frame #15: _PyEval_EvalFrameDefault + 0x198c (0x55f83483f53c in /usr/bin/python)
frame #16: <unknown function> + 0x16893e (0x55f83486393e in /usr/bin/python)
frame #17: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #18: _PyObject_FastCallDictTstate + 0xc4 (0x55f83484ac14 in /usr/bin/python)
frame #19: _PyObject_Call_Prepend + 0xc1 (0x55f8348608d1 in /usr/bin/python)
frame #20: <unknown function> + 0x280700 (0x55f83497b700 in /usr/bin/python)
frame #21: _PyObject_MakeTpCall + 0x25b (0x55f83484ba7b in /usr/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x64e6 (0x55f834844096 in /usr/bin/python)
frame #23: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #24: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #25: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #26: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #27: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #28: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #29: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #30: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #31: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #32: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #33: <unknown function> + 0x16893e (0x55f83486393e in /usr/bin/python)
frame #34: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #35: <unknown function> + 0x16893e (0x55f83486393e in /usr/bin/python)
frame #36: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #37: _PyObject_FastCallDictTstate + 0xc4 (0x55f83484ac14 in /usr/bin/python)
frame #38: _PyObject_Call_Prepend + 0x5c (0x55f83486086c in /usr/bin/python)
frame #39: <unknown function> + 0x280700 (0x55f83497b700 in /usr/bin/python)
frame #40: _PyObject_MakeTpCall + 0x25b (0x55f83484ba7b in /usr/bin/python)
frame #41: _PyEval_EvalFrameDefault + 0x64e6 (0x55f834844096 in /usr/bin/python)
frame #42: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #43: _PyEval_EvalFrameDefault + 0x6bd (0x55f83483e26d in /usr/bin/python)
frame #44: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #45: _PyEval_EvalFrameDefault + 0x8ac (0x55f83483e45c in /usr/bin/python)
frame #46: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #47: PyObject_Call + 0x122 (0x55f834864492 in /usr/bin/python)
frame #48: _PyEval_EvalFrameDefault + 0x2a27 (0x55f8348405d7 in /usr/bin/python)
frame #49: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #50: _PyEval_EvalFrameDefault + 0x6bd (0x55f83483e26d in /usr/bin/python)
frame #51: _PyFunction_Vectorcall + 0x7c (0x55f8348559fc in /usr/bin/python)
frame #52: _PyEval_EvalFrameDefault + 0x6bd (0x55f83483e26d in /usr/bin/python)
frame #53: <unknown function> + 0x13f9c6 (0x55f83483a9c6 in /usr/bin/python)
frame #54: PyEval_EvalCode + 0x86 (0x55f834930256 in /usr/bin/python)
frame #55: <unknown function> + 0x260108 (0x55f83495b108 in /usr/bin/python)
frame #56: <unknown function> + 0x2599cb (0x55f8349549cb in /usr/bin/python)
frame #57: <unknown function> + 0x25fe55 (0x55f83495ae55 in /usr/bin/python)
frame #58: _PyRun_SimpleFileObject + 0x1a8 (0x55f83495a338 in /usr/bin/python)
frame #59: _PyRun_AnyFileObject + 0x43 (0x55f834959f83 in /usr/bin/python)
frame #60: Py_RunMain + 0x2be (0x55f83494ca5e in /usr/bin/python)
frame #61: Py_BytesMain + 0x2d (0x55f83492302d in /usr/bin/python)
frame #62: <unknown function> + 0x29d90 (0x7fb829078d90 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #63: __libc_start_main + 0x80 (0x7fb829078e40 in /usr/lib/x86_64-linux-gnu/libc.so.6)

cc @tfogal @naoyam to assign

I don't think this is an nvFuser issue. I re-ran the snippet and it passes. What is likely happening is that the nvFuser fusion performs a CUDA error check that catches an error originating elsewhere — i.e., the illegal memory access occurred before this fusion ran, and nvFuser's check is merely the first to observe it.