How to prevent GPU OOM?
pepa65 opened this issue · comments
I followed the readme:
export GKSwstype=100 # To avoid an occasional GR bug
git clone https://github.com/jonathan-laurent/AlphaZero.jl.git
cd AlphaZero.jl
julia --project -e 'import Pkg; Pkg.instantiate()'
julia --project -e 'using AlphaZero; Scripts.train("connect-four")'
This led to:
ERROR: Out of GPU memory trying to allocate 146.250 MiB
Effective GPU memory usage: 99.50% (3.792 GiB/3.810 GiB)
Memory pool usage: 960.548 MiB (2.688 GiB reserved)
Stacktrace:
[1] macro expansion
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:411 [inlined]
[2] macro expansion
@ ./timing.jl:382 [inlined]
[3] #_alloc#174
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:404 [inlined]
[4] #alloc#173
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:389 [inlined]
[5] alloc
@ ~/.julia/packages/CUDA/BbliS/src/pool.jl:383 [inlined]
[6] CUDA.CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}(#unused#::UndefInitializer, dims::Tuple{Int64})
@ CUDA ~/.julia/packages/CUDA/BbliS/src/array.jl:42
[7] CuArray
@ ~/.julia/packages/CUDA/BbliS/src/array.jl:125 [inlined]
[8] CuArray
@ ~/.julia/packages/CUDA/BbliS/src/array.jl:132 [inlined]
[9] with_workspace(f::NNlibCUDA.var"#31#33"{Base.RefValue{Float32}, Base.RefValue{Float32}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CUDNN.cudnnConvolutionBwdDataAlgoPerfStruct, CUDA.CUDNN.cudnnFilterDescriptor, CUDA.CUDNN.cudnnTensorDescriptor, CUDA.CUDNN.cudnnTensorDescriptor, CUDA.CUDNN.cudnnConvolutionDescriptor}, eltyp::Type{UInt8}, size::CUDA.APIUtils.var"#2#3"{UInt64}, fallback::Nothing; keep::Bool)
@ CUDA.APIUtils ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:65
[10] with_workspace(f::Function, eltyp::Type{UInt8}, size::CUDA.APIUtils.var"#2#3"{UInt64}, fallback::Nothing)
@ CUDA.APIUtils ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:56
[11] #with_workspace#1
@ ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:53 [inlined]
[12] with_workspace(f::Function, size::UInt64, fallback::Nothing) (repeats 2 times)
@ CUDA.APIUtils ~/.julia/packages/CUDA/BbliS/lib/utils/call.jl:53
[13] ∇conv_data!(dx::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, dy::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, w::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, cdims::NNlib.DenseConvDims{2, 2, 2, 4, 2}; alpha::Int64, beta::Int64, algo::Int64)
@ NNlibCUDA ~/.julia/packages/NNlibCUDA/gzTJY/src/cudnn/conv.jl:101
[14] ∇conv_data!
@ ~/.julia/packages/NNlibCUDA/gzTJY/src/cudnn/conv.jl:89 [inlined]
[15] #∇conv_data#235
@ ~/.julia/packages/NNlib/ydqxJ/src/conv.jl:99 [inlined]
[16] ∇conv_data
@ ~/.julia/packages/NNlib/ydqxJ/src/conv.jl:95 [inlined]
[17] #374
@ ~/.julia/packages/NNlib/ydqxJ/src/conv.jl:350 [inlined]
[18] unthunk
@ ~/.julia/packages/ChainRulesCore/a4mIA/src/tangent_types/thunks.jl:204 [inlined]
[19] wrap_chainrules_output
@ ~/.julia/packages/Zygote/g2w9o/src/compiler/chainrules.jl:110 [inlined]
[20] map
@ ./tuple.jl:223 [inlined]
[21] map
@ ./tuple.jl:224 [inlined]
[22] wrap_chainrules_output
@ ~/.julia/packages/Zygote/g2w9o/src/compiler/chainrules.jl:111 [inlined]
[23] ZBack
@ ~/.julia/packages/Zygote/g2w9o/src/compiler/chainrules.jl:211 [inlined]
[24] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/conv.jl:202 [inlined]
[25] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[26] macro expansion
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[27] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[28] (::typeof(∂(_applychain)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[29] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:51 [inlined]
[30] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[31] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:359 [inlined]
[32] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[33] macro expansion
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[34] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:53 [inlined]
[35] (::typeof(∂(_applychain)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[36] Pullback
@ ~/.julia/packages/Flux/uCLgc/src/layers/basic.jl:51 [inlined]
--- the last 5 lines are repeated 1 more time ---
[42] (::typeof(∂(λ)))(Δ::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[43] Pullback
@ ~/git/AlphaZero.jl/src/networks/flux.jl:142 [inlined]
[44] (::typeof(∂(forward)))(Δ::Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[45] Pullback
@ ~/git/AlphaZero.jl/src/networks/network.jl:264 [inlined]
[46] (::typeof(∂(forward_normalized)))(Δ::Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[47] Pullback
@ ~/git/AlphaZero.jl/src/learning.jl:71 [inlined]
[48] (::typeof(∂(losses)))(Δ::Tuple{Float32, Nothing, Nothing, Nothing, Nothing})
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[49] Pullback
@ ~/git/AlphaZero.jl/src/learning.jl:129 [inlined]
[50] (::typeof(∂(λ)))(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[51] (::Zygote.var"#208#209"{Tuple{NamedTuple{(:W, :X, :A, :P, :V), NTuple{5, Nothing}}}, typeof(∂(λ))})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/lib/lib.jl:206
[52] #2066#back
@ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:67 [inlined]
[53] Pullback
@ ~/git/AlphaZero.jl/src/networks/flux.jl:81 [inlined]
[54] (::typeof(∂(λ)))(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface2.jl:0
[55] (::Zygote.var"#99#100"{Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, typeof(∂(λ)), Zygote.Context{true}})(Δ::Float32)
@ Zygote ~/.julia/packages/Zygote/g2w9o/src/compiler/interface.jl:389
[56] lossgrads(f::Function, args::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
@ AlphaZero.FluxLib ~/git/AlphaZero.jl/src/networks/flux.jl:72
[57] train!(callback::AlphaZero.var"#119#121"{Vector{Float32}}, nn::ResNet, opt::Adam, loss::Function, data::Base.Iterators.Take{Base.Iterators.Stateful{Base.Iterators.Flatten{Base.Generator{Base.Iterators.Repeated{Nothing}, AlphaZero.Util.var"#3#4"{Vector{NamedTuple{(:W, :X, :A, :P, :V), Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}}}, Tuple{NamedTuple{(:W, :X, :A, :P, :V), Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, Tuple{Nothing, Vector{NamedTuple{(:W, :X, :A, :P, :V), Tuple{CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}, Int64}}}}, n::Int64)
@ AlphaZero.FluxLib ~/git/AlphaZero.jl/src/networks/flux.jl:80
[58] batch_updates!(tr::AlphaZero.Trainer, n::Int64)
@ AlphaZero ~/git/AlphaZero.jl/src/learning.jl:132
[59] macro expansion
@ ./timing.jl:463 [inlined]
[60] learning_step!(env::Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}, handler::Session{Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}})
@ AlphaZero ~/git/AlphaZero.jl/src/training.jl:224
[61] macro expansion
@ ./timing.jl:463 [inlined]
[62] macro expansion
@ ~/git/AlphaZero.jl/src/report.jl:267 [inlined]
[63] train!(env::Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}, handler::Session{Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}})
@ AlphaZero ~/git/AlphaZero.jl/src/training.jl:327
[64] resume!(session::Session{Env{AlphaZero.Examples.ConnectFour.GameSpec, ResNet, NamedTuple{(:board, :curplayer), Tuple{StaticArraysCore.SMatrix{7, 6, UInt8, 42}, UInt8}}}})
@ AlphaZero.UserInterface ~/git/AlphaZero.jl/src/ui/session.jl:316
[65] train(e::Experiment; args::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:26
[66] train
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:26 [inlined]
[67] #train#15
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:28 [inlined]
[68] train(s::String)
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:28
Is there some way to tweak parameters so that I can prevent running out of memory?
Yes, you probably want smaller networks (e.g. fewer filters, fewer layers) and smaller batch sizes.
In ./games/connect-four/params.jl
I halved NetLib.ResNetHP(num_filters)
to start with. Didn't find any reference to layers, but when I halved all parameters with batch_size
in it, it crashes. Even if I only modify num_filters
, it doesn't run. What would be an example of a working set of parameters for smaller GPUs? (I have RTX3050.)
Did you have a look here?
Reducing the mem_buffer_size
might work, even though it's not clear why.
If not, you could share your params.jl
file.
Yes, I had a look at #174 but I can't seem to make any modifications that work. Right now I have the current repo games/connect-four/params.jl
except this diff:
--- a/games/connect-four/params.jl
+++ b/games/connect-four/params.jl
@@ -5,7 +5,7 @@
Network = NetLib.ResNet
netparams = NetLib.ResNetHP(
- num_filters=128,
+ num_filters=32, #128,
num_blocks=5,
conv_kernel_size=(3, 3),
num_policy_head_filters=32,
@@ -66,8 +66,9 @@ params = Params(
use_symmetries=true,
memory_analysis=nothing,
mem_buffer_size=PLSchedule(
- [ 0, 15],
- [400_000, 1_000_000]))
+# [ 0, 15],
+# [400_000, 1_000_000]))
+ [0]. [80_000]))
#####
##### Evaluation benchmark
@@ -93,7 +94,8 @@ benchmark_sim = SimParams(
arena.sim;
num_games=256,
num_workers=256,
- batch_size=256,
+ #batch_size=256,
+ batch_size=16,
alternate_colors=false)
benchmark = [
(It crashes...)
From your stack trace, AlphaZero.jl crashes during the gradient-update phase, not during self-play. So my guess is that you should also lower your batch_size
in LearningParams
.
I'm not sure it makes any difference in Julia or whether it should cause a crash during the gradient-update phase, but in your params.jl
, a comma seems to have been replaced by a period in the definition of mem_buffer_size
: "[0]. [80_000]
".
Thanks, replaced the dot with a comma (old eyes...)
When I run this, I get:
[ Info: Using the Flux implementation of AlphaZero.NetLib.
Loading environment from: sessions/connect-four
[ Info: Using modified parameters
ERROR: AssertionError: same_json(Network.hyperparams(env.bestnn), e.netparams)
Stacktrace:
[1] Session(e::Experiment; dir::Nothing, autosave::Bool, nostdout::Bool, save_intermediate::Bool)
@ AlphaZero.UserInterface ~/git/AlphaZero.jl/src/ui/session.jl:288
[2] Session
@ ~/git/AlphaZero.jl/src/ui/session.jl:273 [inlined]
[3] train(e::Experiment; args::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:26
[4] train
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:26 [inlined]
[5] #train#15
@ ~/git/AlphaZero.jl/src/scripts/scripts.jl:28 [inlined]
[6] train(s::String)
@ AlphaZero.Scripts ~/git/AlphaZero.jl/src/scripts/scripts.jl:28
[7] top-level scope
@ none:1
Delete your sessions/connect-four
folder and restart.
I missed batch_size
in LearningParams
, quartered that. Now running with the 2 batch_size
s quartered, and with mem_buffer_size=PLSchedule([0], [80_000]))
. It keeps running now..! I decided also to delete sessions/connect-four
, and see what happens. Thanks for all of your help so far!
Update: It's still going, now on iteration 4 (won 32% on iteration 3).
Update: It finished after a few days, and is playable!