How to prevent GPU OOM?
pepa65 opened this issue · comments
I followed the readme:
export GKSwstype=100 # To avoid an occasional GR bug
git clone https://github.com/jonathan-laurent/AlphaZero.jl.git
cd AlphaZero.jl
julia --project -e 'import Pkg; Pkg.instantiate()'
julia --project -e 'using AlphaZero; Scripts.train("connect-four")'
This led to:
ERROR: Out of GPU memory trying to allocate 146.250 MiB
Effective GPU memory usage: 99.50% (3.792 GiB/3.810 GiB)
Memory pool usage: 960.548 MiB (2.688 GiB reserved)
Stacktrace:
[1] macro expansion
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:411 [inlined]
[2] macro expansion
@ ./timing.jl:382 [inlined]
[3] #_alloc#174
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:404 [inlined]
[4] #alloc#173
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:389 [inlined]
[5] alloc
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:383 [inlined]
[6] CUDA.CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}(#unused#::UndefInitializer, dims::Tuple{Int64})
@ CUDA ~/.julia/packages/CUDA/BbliS/src/array.jl:42
[7] CuArray
@ ~/.julia/packages/CUDA/BbliS/src/array.jl:125 [inlined]
[8] CuArray
@ ~/.julia/packages/CUDA/BbliS/src/array.jl:132 [inlined]
[9] with_workspace(f::NNlibCUDA.var"#31#33"{Base.RefValue{Float32}, Base.RefValue{Float32}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CUDNN.cudnnConvolutionBwdDataAlgoPerfStruct, CUDA.CUDNN.cudnnFilterDescriptor, CUDA.CUDNN.cudnnTensorDescriptor, CUDA.CUDNN.cudnnTensorDescriptor, CUDA.CUDNN.cudnnConvolutionDescriptor}, eltyp::Type{UInt8}, size::CUDA.APIUtils.var"#2#3"{UInt64}, fallback::Nothing; keep::Bool)
@ CUDA.APIUtils ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:65
[10] with_workspace(f::Function, eltyp::Type{UInt8}, size::CUDA.APIUtils.var"#2#3"{UInt64}, fallback::Nothing)
@ CUDA.APIUtils ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:56
[11] #with_workspace#1
@ ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:53 [inlined]
[12] with_workspace(f::Function, size::UInt64, fallback::Nothing) (repeats 2 times)
@ CUDA.APIUtils ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:53
[13] ∇conv_data!(dx::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, dy::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, w::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, cdims::NNlib.DenseConvDims{2, 2, 2, 4, 2}; alpha::Int64, beta::Int64, algo::Int64)
@ NNlibCUDA ~/.julia/packages/NNlibCUDA/gzTJY/src/cudnn/conv.jl:101
[14] ∇conv_data!
@ ~/.julia/packages/NNlibCUDA/gzTJY/src/cudnn/conv.jl:89 [inlined]
[15] #∇conv_data#235
@ ~/.julia/packages/NNlib/ydqxJ/src/conv.jl:99 [inlined]
[16] ∇conv_data
@ ~/.julia/packages/NNlib/ydqxJ/src/conv.jl:95 [inlined]
[17] #374
@ ~/.julia/packages/NNlib/ydqxJ/src/conv.jl:350 [inlined]
[18] unthunk
@ ~/.julia/packages/ChainRulesCore/a4mIA/src/tangent_types/thunks.jl:204 [inlined]
[19] wrap_chainrules_output
@ ~/.julia/packages/Zygote/g2w9o/src/compiler/chainrules.jl:110 [inlined]
[20] map
@ ./tuple.jl:223 [inlined]
[21] map
@ ./tuple.jl:224 [inlined]
[22] wrap_chainrules_output
@ ~/.julia/packages/Zygote/g2w9o/src/compiler/chainrules.jl:111 [inlined]
[23] ZBack
@ ~/.julia/packages/Zygote/g2w9o/src/compiler/chainrules.jl:211 [inlined]
[24] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/conv.jl:202 [inlined]
[25] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[26] macro expansion
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[27] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[28] (::typeof(∂(_applychain)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[29] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:51 [inlined]
[30] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[31] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:359 [inlined]
[32] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[33] macro expansion
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[34] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[35] (::typeof(∂(_applychain)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[36] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:51 [inlined]
--- the last 5 lines are repeated 1 more time ---
[42] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[43] Pullback
@ ~/git/AlphaZero.jl/src/networks/flux.jl:142 [inlined]
[44] (::typeof(∂(forward)))(Δ::Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[45] Pullback
@ ~/git/AlphaZero.jl/src/networks/network.jl:264 [inlined]
[46] (::typeof(∂(forward_normalized)))(Δ::Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[47] Pullback
@ ~/git/AlphaZero.jl/src/learning.jl:71 [inlined]
[48] (::typeof(∂(losses)))(Δ::Tuple{Float32, Nothing, Nothing, Nothing, Nothing})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[49] Pullback
@ ~/git/AlphaZero.jl/src/learning.jl:129 [inlined]
[50] (::typeof(∂(λ)))(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[51] (::Zygote.var"#208#209"{Tuple{NamedTuple{(:W, :X, :A, :P, :V), NTuple{5, Nothing}}}, typeof(∂(λ))})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/lib/lib.jl:206
[52] #2066#back
@ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:67 [inlined]
[53] Pullback
@ ~/git/AlphaZero.jl/src/networks/flux.jl:81 [inlined]
[54] (::typeof(∂(λ)))(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[55] (::Zygote.var"#99#100"{Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, typeof(∂(λ)), Zygote.Context{true}})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface.jl:389
[56] lossgrads(f::Function, args::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
@ AlphaZero.FluxLib ~/git/AlphaZero.jl/src/networks/flux.jl:72
[57] train!(callback::AlphaZero.var"#119#121"{Vector{Float32}}, nn::ResNet, opt::Adam, loss::Function, data::Base.Iterators.Take{Base.Iterators.Stateful{Base.Iterators.Flatten{Base.Generator{Base.Iterators.Repeated{Nothing}, AlphaZero.Util.var"#3#4"{Vector{NamedTuple{(:W, :X, :A, :P, :V), Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}}}, Tuple{NamedTuple{(:W, :X, :A, :P, :V), Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, Tuple{Nothing, Vector{NamedTuple{(:W, :X, :A, :P, :V), Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}, Int64}}}}, n::Int64)
@ AlphaZero.FluxLib ~/git/AlphaZero.jl/src/networks/flux.jl:80
[58] batch_updates!(tr::AlphaZero.Trainer, n::Int64)
@ AlphaZero ~/git/AlphaZero.jl/src/learning.jl:132
[59] macro expansion
@ ./timing.jl:463 [inlined]
[60] learning_step!(env::Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}, handler::Session{Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}})
@ AlphaZero ~/git/AlphaZero.jl/src/training.jl:224
[61] macro expansion
@ ./timing.jl:463 [inlined]
[62] macro expansion
@ ~/git/AlphaZero.jl/src/report.jl:267 [inlined]
[63] train!(env::Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}, handler::Session{Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}})
@ AlphaZero ~/git/AlphaZero.jl/src/training.jl:327
[64] resume!(session::Session{Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}})
@ AlphaZero.UserInterface ~/git/AlphaZero.jl/src/ui/session.jl:316
[65] train(e::Experiment; args::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:26
[66] train
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:26 [inlined]
[67] #train#15
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:28 [inlined]
[68] train(s::String)
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:28
Is there some way to tweak parameters so that I can prevent running out of memory?
Yes, you probably want smaller networks (e.g. fewer filters, fewer layers) and smaller batch sizes.
In ./games/connect-four/params.jl
I halved NetLib.ResNetHP(num_filters)
to start with. Didn't find any reference to layers, but when I halved all parameters with batch_size
in it, it crashes. Even if I only modify num_filters
, it doesn't run. What would be an example of a working set of parameters for smaller GPUs? (I have RTX3050.)
Did you have a look here?
Reducing the mem_buffer_size
might work, even though it's not clear why.
If not, you could share your params.jl
file.
Yes, I had a look at #174 but I can't seem to make any modifications that work. Right now I have the current repo games/connect-four/params.jl
except this diff:
--- a/games/connect-four/params.jl
+++ b/games/connect-four/params.jl
@@ -5,7 +5,7 @@
Network = NetLib.ResNet
netparams = NetLib.ResNetHP(
- num_filters=128,
+ num_filters=32, #128,
num_blocks=5,
conv_kernel_size=(3, 3),
num_policy_head_filters=32,
@@ -66,8 +66,9 @@ params = Params(
use_symmetries=true,
memory_analysis=nothing,
mem_buffer_size=PLSchedule(
- [ 0, 15],
- [400_000, 1_000_000]))
+# [ 0, 15],
+# [400_000, 1_000_000]))
+ [0]. [80_000]))
#####
##### Evaluation benchmark
@@ -93,7 +94,8 @@ benchmark_sim = SimParams(
arena.sim;
num_games=256,
num_workers=256,
- batch_size=256,
+ #batch_size=256,
+ batch_size=16,
alternate_colors=false)
benchmark = [
(It crashes...)
From your stack trace, AlphaZero.jl crashes during the gradient-update phase, not during self-play. So my guess is that you should also lower your batch_size
in LearningParams
.
I'm not sure it makes any difference in Julia or whether it should cause a crash during the gradient-update phase, but in your params.jl
, a comma seems to have been replaced by a period in the definition of mem_buffer_size
: "[0]. [80_000]
".
Thanks, replaced the dot with a comma (old eyes...)
When I run this, I get:
[ Info: Using the Flux implementation of AlphaZero.NetLib.
Loading environment from: sessions/connect-four
[ Info: Using modified parameters
ERROR: AssertionError: same_json(Network.hyperparams(env.bestnn), e.netparams)
Stacktrace:
[1] Session(e::Experiment; dir::Nothing, autosave::Bool, nostdout::Bool, save_intermediate::Bool)
@ AlphaZero.UserInterface ~/git/AlphaZero.jl/src/ui/session.jl:288
[2] Session
@ ~/git/AlphaZero.jl/src/ui/session.jl:273 [inlined]
[3] train(e::Experiment; args::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:26
[4] train
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:26 [inlined]
[5] #train#15
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:28 [inlined]
[6] train(s::String)
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:28
[7] top-level scope
@ none:1
Delete your sessions/connect-four
folder and restart.
I missed batch_size
in LearningParams
, quartered that. Now running with the 2 batch_size
s quartered, and with mem_buffer_size=PLSchedule([0], [80_000]))
. It keeps running now..! I decided also to delete sessions/connect-four
, and see what happens. Thanks for all of your help so far!
Update: It's still going, now on iteration 4 (won 32% on iteration 3).
Update: It finished after a few days, and is playable!