SHI-Labs / Neighborhood-Attention-Transformer

Neighborhood Attention Transformer, arXiv 2022 / CVPR 2023. Dilated Neighborhood Attention Transformer, arXiv 2022.

Some problem during training

guoguo1314 opened this issue

When training with CUDA, the following problem occurs:

RuntimeError:
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)

typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);

namespace std {

using ::signbit;
using ::isfinite;
using ::isinf;
using ::isnan;

using ::abs;

using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atanf;
using ::atan2;
using ::atan2f;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;

using ::exp;
using ::expf;

using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;

using ::fmod;
using ::fmodf;

using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;

using ::log;
using ::logf;

using ::log10;
using ::log10f;
using ::modf;
using ::modff;

using ::pow;
using ::powf;

using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;

using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;

using ::tanh;
using ::tanhf;

using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;

using ::copysign;
using ::copysignf;

using ::erf;
using ::erff;
using ::erfc;
using ::erfcf;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fmaf;
using ::fma;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;

using ::nan;
using ::nanf;

using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;

} // namespace std

// NB: Order matters for this macro; it is relied upon in
// promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
  _(uint8_t, Byte) /* 0 */ \
  _(int8_t, Char) /* 1 */ \
  _(int16_t, Short) /* 2 */ \
  _(int, Int) /* 3 */ \
  _(int64_t, Long) /* 4 */ \
  _(at::Half, Half) /* 5 */ \
  _(float, Float) /* 6 */ \
  _(double, Double) /* 7 */ \
  _(std::complex<at::Half>, ComplexHalf) /* 8 */ \
  _(std::complex<float>, ComplexFloat) /* 9 */ \
  _(std::complex<double>, ComplexDouble) /* 10 */ \
  _(bool, Bool) /* 11 */ \
  _(void, QInt8) /* 12 */ \
  _(void, QUInt8) /* 13 */ \
  _(void, QInt32) /* 14 */ \
  _(at::BFloat16, BFloat16) /* 15 */

#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \
  _(uint8_t, Byte) \
  _(int8_t, Char) \
  _(int16_t, Short) \
  _(int, Int) \
  _(int64_t, Long) \
  _(at::Half, Half) \
  _(float, Float) \
  _(double, Double) \
  _(std::complex<at::Half>, ComplexHalf) \
  _(std::complex<float>, ComplexFloat) \
  _(std::complex<double>, ComplexDouble) \
  _(bool, Bool) \
  _(at::BFloat16, BFloat16)

enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
Undefined,
NumOptions
};

template <typename T, int size>
struct Array {
T data[size];

__device__ T operator[](int i) const {
return data[i];
}
__device__ T& operator[](int i) {
return data[i];
}
Array() = default;
Array(const Array&) = default;
Array& operator=(const Array&) = default;
__device__ Array(T x) {
for (int i = 0; i < size; i++) {
data[i] = x;
}
}
};

template <typename scalar_t>
__device__ inline scalar_t load(char* base_ptr, uint32_t offset) {
return *(reinterpret_cast<scalar_t*>(base_ptr) + offset);
}

template <typename scalar_t>
__device__ inline void store(scalar_t value, char *base_ptr, uint32_t offset) {
*(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;
}

// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
scalar_t val[vec_size];
};

template <typename T> T erfc_kernel(T a) { return erfc(a); }

// TODO: setup grid-stride loop

extern "C" global
void erfc_kernel_vectorized4_kernel(
const int N,
Array<char*, 1+1> data,
float scalar_val) //[1+1],
{
constexpr int vec_size = 4;
int remaining = N - block_work_size * blockIdx.x;
auto thread_idx = threadIdx.x;
int idx = blockIdx.x;
float arg0[4];

  float out0[4];
  

  if (remaining < block_work_size) {
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++){
      if (thread_idx >= remaining) {
        break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      arg0[j] = load<float>(data[1], linear_idx);
      
      thread_idx += num_threads;
    }
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if ((threadIdx.x  + j*num_threads) < remaining) {
        out0[j] = erfc_kernel<float>(arg0[j] );
      }
    }
    thread_idx = threadIdx.x;
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if (thread_idx >= remaining) {
          break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      store<float>(out0[j], data[0], linear_idx);
      
      thread_idx += num_threads;
    }
  } else {
    static constexpr int loop_size = thread_work_size / vec_size;

//actual loading
using vec_t_input = aligned_vector<float, vec_size>;
vec_t_input * vec0 = reinterpret_cast<vec_t_input *>(data[0+1]) + block_work_size / vec_size * idx;

    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_input v;
      v = vec0[thread_idx];
      #pragma unroll
      for (int j=0; j < vec_size; j++){
        arg0[vec_size * i + j] = v.val[j];
      }
      
      thread_idx += num_threads;
    }

    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      out0[j] = erfc_kernel<float>(arg0[j] );
    }

    using vec_t_output = aligned_vector<float, vec_size>;
    vec_t_output* to_0 = reinterpret_cast<vec_t_output*>(data[0]) + block_work_size / vec_size * idx;
    
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_output v;
      #pragma unroll
      for (int j=0; j<vec_size; j++){
      v.val[j] = out0[vec_size * i + j];
      }
      to_0[thread_idx] = v;
      
      thread_idx += num_threads;
    }
  }

}
nvrtc: error: invalid value for --gpu-architecture (-arch)

Training with the CPU works normally.

The installed versions are as follows:
[screenshot of installed package versions]

thanks a million!!!

Hello and thank you for your interest.

Could you please share which version of NATTEN you're on, and confirm you are not adding any external modules to the code?

Because the issue does not appear to be anything related to this repository specifically.

Thank you very much!
NATTEN is 0.14.6 (the other packages are shown in the screenshot above).
Yes, I only integrated NATLayer into the model used in our research area; it ran normally before the integration. After integrating it, this error occurred when training on a 4090ti.

As far as I can tell the issue is that part of the code you're attempting to run tries to just-in-time compile something, and the arguments there are incorrect. That's why you're getting:

nvrtc: error: invalid value for --gpu-architecture (-arch)

Could you check whether a forward and backward pass into NATLayer works separately?

# from .nat import NATLayer
import torch

nat_layer = NATLayer(dim=32, num_heads=4, kernel_size=7, dilation=1).cuda()
x = torch.randn((2, 14, 14, 32)).cuda()

x_out = nat_layer(x)
x_out_sum = x_out.sum()

x_out_sum.backward()
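
If that also fails on its own, a minimal diagnostic sketch (assuming a standard pip-installed PyTorch) is to compare the GPU's compute capability against the architectures your PyTorch binary was built for; an RTX 4090 reports capability (8, 9), i.e. sm_89, and if that architecture is missing from the list, an nvrtc --gpu-architecture failure like the one above would be the expected symptom:

import torch

# Compute capability of the active GPU; an RTX 4090 reports (8, 9), i.e. sm_89.
print("device capability:", torch.cuda.get_device_capability(0))

# Architectures this PyTorch binary was compiled for, e.g. ['sm_37', ..., 'sm_86'].
print("compiled arch list:", torch.cuda.get_arch_list())

# CUDA version the PyTorch binary was built against.
print("torch built with CUDA:", torch.version.cuda)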

@alihassanijr is the natten version correct? He has 0.14.6+torch1120cu113, but his CUDA version is 11.7 (mine ends in torch1130cu117 for PyTorch 1.13). I just want to verify, because the simple solution could be reinstalling PyTorch and making sure the versions match.

this error occurred when training on a 4090ti.

Can you also verify this? The 4090Ti is not publicly released.

Could you check whether a forward and backward pass into NATLayer works separately?

Ok, I can try, thanks!

Can you also verify this? The 4090Ti is not publicly released.

I said it wrong; it's a 4090.
As for the package issue: I install a package specific to our research area, which automatically pulls in torch and CUDA packages such as nvidia-cuda-cupti-cu11, so I cannot pin the torch version.
But I wanted to use NATTEN, and the CUDA version I installed is 11.3; according to the table on the PyTorch website, CUDA 11.3 goes with torch 1.12.0, so I installed torch 1.12.0 to override the automatically downloaded one. Maybe the problem is there, but I haven't found a good way around it.
Oh, it occurs to me that I could replace the CUDA 11.3 packages with the automatically downloaded CUDA 11.7 ones.
I am running other code these days, so I can try this method a few days later.
Thanks.

Just to be clear, there is a difference between the CUDA driver/compiler version you have installed and the one you see in your PyTorch version string. The CUDA version suffix in your PyTorch version (if you installed via pip) indicates which version of CUDA was used to compile the PyTorch binaries you downloaded.

Your local CUDA compiler can be on a different version compared to your PyTorch (though not usually a good idea), but as long as the version of NATTEN you install exactly matches your torch AND the version of CUDA it was compiled with, everything should work out.

But again, to be very clear, the error you shared indicates that something in your code is trying to compile CUDA code, and it fails because of an argument mismatch with your compiler, which is probably because of the mismatch between your CUDA version and PyTorch build. However, neither of those are related to NATTEN, since you already have it installed.
Therefore, the issue here is completely unrelated to this project.
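
As a quick sanity check, a minimal sketch along these lines (assuming both torch and natten were installed with pip, so their metadata is visible to importlib) prints the versions that need to agree:

import torch
from importlib.metadata import version

# PyTorch version and the CUDA version it was built against,
# e.g. 1.12.0+cu113 was built with CUDA 11.3.
print("torch:", torch.__version__)
print("torch built with CUDA:", torch.version.cuda)

# The NATTEN wheel encodes the torch/CUDA combination it was built for,
# e.g. 0.14.6+torch1120cu113; it should match the two values above exactly.
print("natten:", version("natten"))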

Ok, I see, thank you!!!

I'm closing the issue now, but feel free to reopen if you still have questions related to NA/NATTEN.