JuliaGPU / CUDA.jl

CUDA programming in Julia.

Home Page:https://juliagpu.org/cuda/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Kernel using `StaticArray` compiles in julia v1.9.4 but not in v1.10.2

lcw opened this issue · comments

Describe the bug

The kernel

function demo(A)
    B = MVector{3,SVector{1,Float32}}(undef)

    # If `@inbounds` is added to this loop the code compiles
    for o = 1:3
        B[o] = A[o] * SVector{1,Float32}(1.0f0)
    end

    return nothing
end

compiles and runs with julia v1.9.4 but fails to compile with julia v1.10.2 with the error:

ERROR: InvalidIRError: compiling MethodInstance for demo(::CuDeviceVector{Float32, 1}) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to julia.new_gc_frame)
Reason: unsupported call to an unknown function (call to julia.push_gc_frame)
Reason: unsupported call to an unknown function (call to julia.get_gc_frame_slot)
Reason: unsupported call to an unknown function (call to julia.pop_gc_frame)
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/validation.jl:147
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:440 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:439 [inlined]
  [5] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/utils.jl:92
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/utils.jl:86 [inlined]
  [7] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:129
  [8] codegen
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:110 [inlined]
  [9] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:106
 [10] compile
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:98 [inlined]
 [11] #1072
    @ ~/.julia/packages/CUDA/htRwP/src/compiler/compilation.jl:247 [inlined]
 [12] JuliaContext(f::CUDA.var"#1072#1075"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:47
 [13] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/compilation.jl:246
 [14] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:125
 [15] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:103
 [16] macro expansion
    @ ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:367 [inlined]
 [17] macro expansion
    @ ./lock.jl:267 [inlined]
 [18] cufunction(f::typeof(demo), tt::Type{Tuple{CuDeviceVector{Float32, 1}}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:362
 [19] cufunction(f::typeof(demo), tt::Type{Tuple{CuDeviceVector{Float32, 1}}})
    @ CUDA ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:359
 [20] macro expansion
    @ ~/.julia/packages/CUDA/htRwP/src/compiler/execution.jl:112 [inlined]
 [21] top-level scope
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/reflection.jl:206
 [22] top-level scope
    @ ~/.julia/packages/CUDA/htRwP/src/initialization.jl:206
Some type information was truncated. Use `show(err)` to see complete types.


To reproduce

The Minimal Working Example (MWE) for this bug:

using CUDA
using StaticArrays

A = CUDA.zeros(Float32, 3)

function demo(A)
    B = MVector{3,SVector{1,Float32}}(undef)

    # If `@inbounds` is added to this loop the code compiles
    for o = 1:3
        B[o] = A[o] * SVector{1,Float32}(1.0f0)
    end

    return nothing
end

@cuda demo(A)
Manifest.toml

# This file is machine-generated - editing it directly is not advised

julia_version = "1.10.2"
manifest_format = "2.0"
project_hash = "ea66756cd36b9071e136b2612971efcfb913d90c"

[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"

    [deps.AbstractFFTs.extensions]
    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
    AbstractFFTsTestExt = "Test"

    [deps.AbstractFFTs.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "4.0.4"
weakdeps = ["StaticArrays"]

    [deps.Adapt.extensions]
    AdaptStaticArraysExt = "StaticArrays"

[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"

[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"

[[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.4.2"

[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

[[deps.CEnum]]
git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.5.0"

[[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics"]
git-tree-sha1 = "baa8ea7a1ea63316fa3feb454635215773c9c845"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "5.2.0"

    [deps.CUDA.extensions]
    ChainRulesCoreExt = "ChainRulesCore"
    SpecialFunctionsExt = "SpecialFunctions"

    [deps.CUDA.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[[deps.CUDA_Driver_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "d01bfc999768f0a31ed36f5d22a76161fc63079c"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.7.0+1"

[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "2cb12f6b2209f40a4b8967697689a47c50485490"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.3"

[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "8e25c009d2bf16c2c31a70a6e9e8939f7325cc84"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.11.1+0"

[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"

[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"

[[deps.Compat]]
deps = ["TOML", "UUIDs"]
git-tree-sha1 = "c955881e3c981181362ae4088b35995446298b80"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.14.0"
weakdeps = ["Dates", "LinearAlgebra"]

    [deps.Compat.extensions]
    CompatLinearAlgebraExt = "LinearAlgebra"

[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.1.0+0"

[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"

[[deps.DataAPI]]
git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.16.0"

[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"

[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0f4b5d62a88d8f59003e43c25a8a90de9eb76317"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.18"

[[deps.DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"

[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"

[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.10"

[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"

[[deps.Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"

[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "47e4686ec18a9620850bad110b79966132f14283"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "10.0.2"

[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.6"

[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "a846f297ce9d09ccba02ead0cae70690e072a119"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.25.0"

[[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
version = "1.4.0"

[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[deps.InvertedIndices]]
git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.3.0"

[[deps.IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"

[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.5.0"

[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.2.1+0"

[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "ed7167240f40e62d97c1f5f7735dea6de3cc5c49"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.18"

    [deps.KernelAbstractions.extensions]
    EnzymeExt = "EnzymeCore"

    [deps.KernelAbstractions.weakdeps]
    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"

[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
git-tree-sha1 = "ab01dde107f21aa76144d0771dccc08f152ccac7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "6.6.2"
weakdeps = ["BFloat16s"]

    [deps.LLVM.extensions]
    BFloat16sExt = "BFloat16s"

[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "88b916503aac4fb7f701bb625cd84ca5dd1677bc"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.29+0"

[[deps.LLVMLoopInfo]]
git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea"
uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586"
version = "1.0.0"

[[deps.LaTeXStrings]]
git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec"
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
version = "1.3.1"

[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"

[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.4"

[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "8.4.0+0"

[[deps.LibGit2]]
deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

[[deps.LibGit2_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
version = "1.6.4+0"

[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.11.0+1"

[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.13"

[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+1"

[[deps.Missings]]
deps = ["DataAPI"]
git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.1.0"

[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2023.1.10"

[[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.4"

[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"

[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"

[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.23+4"

[[deps.OrderedCollections]]
git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.6.3"

[[deps.Parsers]]
deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.8.1"

[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.10.0"

[[deps.PooledArrays]]
deps = ["DataAPI", "Future"]
git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "1.4.3"

[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.1"

[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.4.3"

[[deps.PrettyTables]]
deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660"
uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
version = "2.3.1"

[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"

[[deps.Random]]
deps = ["SHA"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[deps.Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.7.0"

[[deps.RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"

[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.1"

[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
git-tree-sha1 = "0e7508ff27ba32f26cd459474ca2ede1bc10991f"
uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
version = "1.4.1"

[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

[[deps.SortingAlgorithms]]
deps = ["DataStructures"]
git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.2.1"

[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
version = "1.10.0"

[[deps.StaticArrays]]
deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
git-tree-sha1 = "bf074c045d3d5ffd956fa0a461da38a44685d6b2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.9.3"

    [deps.StaticArrays.extensions]
    StaticArraysChainRulesCoreExt = "ChainRulesCore"
    StaticArraysStatisticsExt = "Statistics"

    [deps.StaticArrays.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"

[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.10.0"

[[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
version = "0.3.4"

[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "7.2.1+1"

[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"

[[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"

[[deps.Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.11.1"

[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"

[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.23"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"

[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"

[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+1"

[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+1"

[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.52.0+1"

[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+2"

Julia v1.9.4 Manifest.toml

# This file is machine-generated - editing it directly is not advised

julia_version = "1.9.4"
manifest_format = "2.0"
project_hash = "ea66756cd36b9071e136b2612971efcfb913d90c"

[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"

    [deps.AbstractFFTs.extensions]
    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
    AbstractFFTsTestExt = "Test"

    [deps.AbstractFFTs.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "4.0.4"
weakdeps = ["StaticArrays"]

    [deps.Adapt.extensions]
    AdaptStaticArraysExt = "StaticArrays"

[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"

[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"

[[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.4.2"

[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

[[deps.CEnum]]
git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.5.0"

[[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics"]
git-tree-sha1 = "baa8ea7a1ea63316fa3feb454635215773c9c845"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "5.2.0"

    [deps.CUDA.extensions]
    ChainRulesCoreExt = "ChainRulesCore"
    SpecialFunctionsExt = "SpecialFunctions"

    [deps.CUDA.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[[deps.CUDA_Driver_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "d01bfc999768f0a31ed36f5d22a76161fc63079c"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.7.0+1"

[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "2cb12f6b2209f40a4b8967697689a47c50485490"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.3"

[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "8e25c009d2bf16c2c31a70a6e9e8939f7325cc84"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.11.1+0"

[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"

[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"

[[deps.Compat]]
deps = ["TOML", "UUIDs"]
git-tree-sha1 = "c955881e3c981181362ae4088b35995446298b80"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.14.0"
weakdeps = ["Dates", "LinearAlgebra"]

    [deps.Compat.extensions]
    CompatLinearAlgebraExt = "LinearAlgebra"

[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"

[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"

[[deps.DataAPI]]
git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.16.0"

[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"

[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0f4b5d62a88d8f59003e43c25a8a90de9eb76317"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.18"

[[deps.DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"

[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"

[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.10"

[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"

[[deps.Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"

[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "47e4686ec18a9620850bad110b79966132f14283"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "10.0.2"

[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.6"

[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "a846f297ce9d09ccba02ead0cae70690e072a119"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.25.0"

[[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
version = "1.4.0"

[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[deps.InvertedIndices]]
git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.3.0"

[[deps.IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"

[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.5.0"

[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.2.1+0"

[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "ed7167240f40e62d97c1f5f7735dea6de3cc5c49"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.18"

    [deps.KernelAbstractions.extensions]
    EnzymeExt = "EnzymeCore"

    [deps.KernelAbstractions.weakdeps]
    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"

[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
git-tree-sha1 = "ab01dde107f21aa76144d0771dccc08f152ccac7"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "6.6.2"
weakdeps = ["BFloat16s"]

    [deps.LLVM.extensions]
    BFloat16sExt = "BFloat16s"

[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "88b916503aac4fb7f701bb625cd84ca5dd1677bc"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.29+0"

[[deps.LLVMLoopInfo]]
git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea"
uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586"
version = "1.0.0"

[[deps.LaTeXStrings]]
git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec"
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
version = "1.3.1"

[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"

[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.4"

[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "8.4.0+0"

[[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.11.0+1"

[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.13"

[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"

[[deps.Missings]]
deps = ["DataAPI"]
git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.1.0"

[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"

[[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.4"

[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"

[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"

[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"

[[deps.OrderedCollections]]
git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.6.3"

[[deps.Parsers]]
deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.8.1"

[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.2"

[[deps.PooledArrays]]
deps = ["DataAPI", "Future"]
git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "1.4.3"

[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.1"

[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.4.3"

[[deps.PrettyTables]]
deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660"
uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
version = "2.3.1"

[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"

[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[deps.Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.7.0"

[[deps.RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"

[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.1"

[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
git-tree-sha1 = "0e7508ff27ba32f26cd459474ca2ede1bc10991f"
uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
version = "1.4.1"

[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

[[deps.SortingAlgorithms]]
deps = ["DataStructures"]
git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.2.1"

[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[[deps.StaticArrays]]
deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
git-tree-sha1 = "bf074c045d3d5ffd956fa0a461da38a44685d6b2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.9.3"

    StaticArraysStatisticsExt = "Statistics"

    [deps.StaticArrays.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[deps.StaticArraysCore]]
    [deps.StaticArrays.extensions]
    StaticArraysChainRulesCoreExt = "ChainRulesCore"
    StaticArraysStatisticsExt = "Statistics"

    [deps.StaticArrays.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"

[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"

[[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
version = "0.3.4"

[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"

[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"

[[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"

[[deps.Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.11.1"

[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"

[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.23"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"

[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"

[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"

[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"

[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.52.0+1"

[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

Expected behavior

I expect the code to compile on both versions of julia.

Version info

Details on Julia:

julia> versioninfo(verbose=true)
Julia Version 1.10.2
Commit bd47eca2c8a (2024-03-01 10:14 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  uname: Linux 6.1.82 #1-NixOS SMP PREEMPT_DYNAMIC Fri Mar 15 18:27:50 UTC 2024 x86_64 unknown
  CPU: AMD Ryzen 9 7950X 16-Core Processor: 
                 speed         user         nice          sys         idle          irq
       #1-32  3111 MHz       9230 s        445 s       2365 s    4342371 s          0 s
  Memory: 61.937007904052734 GB (57126.453125 MB free)
  Uptime: 13620.16 sec
  Load Avg:  0.13  0.1  0.09
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 32 virtual cores)
Environment:
  LD_LIBRARY_PATH = /nix/store/1bapla8zwlx128i42wi08zbiibwk6gm3-libglvnd-1.7.0/lib:/nix/store/chsnmb6qwq9hbjc4xp6fxlsvbxxqaawd-nvidia-x11-545.29.02-nixGL/lib:/nix/store/24d236bvx1yn0s2l2iymlh6bajrc0kgk-nvidia-x11-545.29.02-nixGL-lib32/lib:/nix/store/gaad1akyzhij4pigidhj2zk8p9v1rjy6-libglvnd-1.7.0/lib:/nix/store/j2k44gyws2wlx3xrkv54zv0nhjihr0a5-pipewire-1.0.1-jack/lib
  NIX_LD_LIBRARY_PATH = /run/current-system/sw/share/nix-ld/lib
  JAVA_HOME = /nix/store/iqz8lfhx69qxb0ydixfldsf0wiw87q34-openjdk-21+35/lib/openjdk
  XCURSOR_PATH = /home/lucas/.icons:/home/lucas/.local/share/icons:/home/lucas/.nix-profile/share/icons:/home/lucas/.nix-profile/share/pixmaps:/nix/profile/share/icons:/nix/profile/share/pixmaps:/home/lucas/.local/state/nix/profile/share/icons:/home/lucas/.local/state/nix/profile/share/pixmaps:/etc/profiles/per-user/lucas/share/icons:/etc/profiles/per-user/lucas/share/pixmaps:/nix/var/nix/profiles/default/share/icons:/nix/var/nix/profiles/default/share/pixmaps:/run/current-system/sw/share/icons:/run/current-system/sw/share/pixmaps:/home/lucas/.nix-profile/share/icons
  TMUX_PLUGIN_MANAGER_PATH = /home/lucas/.tmux/plugins/
  GNUPGHOME = /home/lucas/.gnupg
  NIX_PATH = nixpkgs=/nix/store/dc4k1xnrq4g92n2n9j1nfvahlnq4gr5l-source
  HOME = /home/lucas
  INFOPATH = /home/lucas/.nix-profile/info:/home/lucas/.nix-profile/share/info:/nix/profile/info:/nix/profile/share/info:/home/lucas/.local/state/nix/profile/info:/home/lucas/.local/state/nix/profile/share/info:/etc/profiles/per-user/lucas/info:/etc/profiles/per-user/lucas/share/info:/nix/var/nix/profiles/default/info:/nix/var/nix/profiles/default/share/info:/run/current-system/sw/info:/run/current-system/sw/share/info
  TERM = tmux-256color
  GTK_PATH = /home/lucas/.nix-profile/lib/gtk-2.0:/home/lucas/.nix-profile/lib/gtk-3.0:/home/lucas/.nix-profile/lib/gtk-4.0:/nix/profile/lib/gtk-2.0:/nix/profile/lib/gtk-3.0:/nix/profile/lib/gtk-4.0:/home/lucas/.local/state/nix/profile/lib/gtk-2.0:/home/lucas/.local/state/nix/profile/lib/gtk-3.0:/home/lucas/.local/state/nix/profile/lib/gtk-4.0:/etc/profiles/per-user/lucas/lib/gtk-2.0:/etc/profiles/per-user/lucas/lib/gtk-3.0:/etc/profiles/per-user/lucas/lib/gtk-4.0:/nix/var/nix/profiles/default/lib/gtk-2.0:/nix/var/nix/profiles/default/lib/gtk-3.0:/nix/var/nix/profiles/default/lib/gtk-4.0:/run/current-system/sw/lib/gtk-2.0:/run/current-system/sw/lib/gtk-3.0:/run/current-system/sw/lib/gtk-4.0
  QTWEBKIT_PLUGIN_PATH = /home/lucas/.nix-profile/lib/mozilla/plugins/:/nix/profile/lib/mozilla/plugins/:/home/lucas/.local/state/nix/profile/lib/mozilla/plugins/:/etc/profiles/per-user/lucas/lib/mozilla/plugins/:/nix/var/nix/profiles/default/lib/mozilla/plugins/:/run/current-system/sw/lib/mozilla/plugins/
  LIBEXEC_PATH = /home/lucas/.nix-profile/lib/libexec:/nix/profile/lib/libexec:/home/lucas/.local/state/nix/profile/lib/libexec:/etc/profiles/per-user/lucas/lib/libexec:/nix/var/nix/profiles/default/lib/libexec:/run/current-system/sw/lib/libexec
  PATH = /run/wrappers/bin:/home/lucas/.nix-profile/bin:/nix/profile/bin:/home/lucas/.local/state/nix/profile/bin:/etc/profiles/per-user/lucas/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin:/home/lucas/.local/bin:/home/lucas/.config/zsh/plugins/fzf-zsh-plugin/bin:/home/lucas/.fzf/bin
  NVTX_INJECTION64_PATH = /home/lucas/.julia/artifacts/c0e6b8fff2621303ace1cc360b7fca676b4e28fd/lib/libcupti.so

Details on CUDA:

julia> CUDA.versioninfo()
CUDA runtime 12.3, artifact installation
CUDA driver 12.3
NVIDIA driver 545.29.2

CUDA libraries: 
- CUBLAS: 12.3.4
- CURAND: 10.3.4
- CUFFT: 11.0.12
- CUSOLVER: 11.5.4
- CUSPARSE: 12.2.0
- CUPTI: 21.0.0
- NVML: 12.0.0+545.29.2

Julia packages: 
- CUDA: 5.2.0
- CUDA_Driver_jll: 0.7.0+1
- CUDA_Runtime_jll: 0.11.1+0

Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7

1 device:
  0: NVIDIA GeForce RTX 4090 (sm_89, 21.084 GiB / 22.488 GiB available)

Additional context
Both versions of julia give the same @device_code_warntype.

julia> @device_code_warntype @cuda demo(A)
PTX CompilerJob of MethodInstance for demo(::CuDeviceVector{Float32, 1}) for sm_86

MethodInstance for demo(::CuDeviceVector{Float32, 1})
  from demo(A) @ Main ~/research/code/julia/bug/demo.jl:6
Arguments
  #self#::Core.Const(demo)
  A::CuDeviceVector{Float32, 1}
Locals
  @_3::Union{Nothing, Tuple{Int64, Int64}}
  B::MVector{3, SVector{1, Float32}}
  o::Int64
Body::Nothing
1 ─ %1  = Main.MVector::Core.Const(MVector)
│   %2  = Core.apply_type(Main.SVector, 1, Main.Float32)::Core.Const(SVector{1, Float32})
│   %3  = Core.apply_type(%1, 3, %2)::Core.Const(MVector{3, SVector{1, Float32}})
│         (B = (%3)(Main.undef))
│   %5  = (1:3)::Core.Const(1:3)
│         (@_3 = Base.iterate(%5))
│   %7  = (@_3::Core.Const((1, 1)) === nothing)::Core.Const(false)
│   %8  = Base.not_int(%7)::Core.Const(true)
└──       goto #4 if not %8
2 ┄ %10 = @_3::Tuple{Int64, Int64}
│         (o = Core.getfield(%10, 1))
│   %12 = Core.getfield(%10, 2)::Int64
│   %13 = Base.getindex(A, o)::Float32
│   %14 = Core.apply_type(Main.SVector, 1, Main.Float32)::Core.Const(SVector{1, Float32})
│   %15 = (%14)(1.0f0)::Core.Const(Float32[1.0])
│   %16 = (%13 * %15)::SVector{1, Float32}
│         Base.setindex!(B, %16, o)
│         (@_3 = Base.iterate(%5, %12))
│   %19 = (@_3 === nothing)::Bool
│   %20 = Base.not_int(%19)::Bool
└──       goto #4 if not %20
3 ─       goto #2
4 ┄       return Main.nothing

MArray is an unreliable type that makes your code depends on the optimizations of the LLVM-level allocopt pass. You should be using an array type that doesn't rely on mutable semantics that are then optimized away.

That said, with this specific example it looks like Base codegen for Vector{Float32} does result in optimized code, so it may be interesting to look into why.

Thanks for taking a look! Do you have a suggested array type?

This example is a reduction from a Kernel Abstractions kernel https://github.com/HorribleSanity/Raven.jl/blob/971310fdb4a691281b33b0a6114f8cd2c4d1f01f/src/lobattocells.jl#L2441-L2583
where I am using the MArray via @private. If this is not recommended, I am confused why KernelAbstractions is using it. Will this not cause problems when JuliaGPU/GPUArrays.jl#525 lands?

That said, I have found a mutable array of registers quite useful to minimize code duplication, e.g,

CUDA.jl/perf/volumerhs.jl

Lines 96 to 223 in 7f725c0

r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef)
r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef)
r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef)
r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef)
r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef)
e = blockIdx().x
j = threadIdx().y
i = threadIdx().x
@inbounds begin
for k in 1:Nq
r_rhsρ[k] = zero(eltype(rhs))
r_rhsU[k] = zero(eltype(rhs))
r_rhsV[k] = zero(eltype(rhs))
r_rhsW[k] = zero(eltype(rhs))
r_rhsE[k] = zero(eltype(rhs))
end
# fetch D into shared
s_D[i, j] = D[i, j]
@unroll for k in 1:Nq
sync_threads()
# Load values will need into registers
MJ = vgeo[i, j, k, _MJ, e]
ξx, ξy, ξz = vgeo[i,j,k,_ξx,e], vgeo[i,j,k,_ξy,e], vgeo[i,j,k,_ξz,e]
ηx, ηy, ηz = vgeo[i,j,k,_ηx,e], vgeo[i,j,k,_ηy,e], vgeo[i,j,k,_ηz,e]
ζx, ζy, ζz = vgeo[i,j,k,_ζx,e], vgeo[i,j,k,_ζy,e], vgeo[i,j,k,_ζz,e]
z = vgeo[i,j,k,_z,e]
U, V, W = Q[i, j, k, _U, e], Q[i, j, k, _V, e], Q[i, j, k, _W, e]
ρ, E = Q[i, j, k, _ρ, e], Q[i, j, k, _E, e]
# GPU performance trick
# Allow optimizations to use the reciprocal of an argument rather than perform division.
# IEEE floating-point division is implemented as a function call
ρinv = rcp(ρ)
ρ2inv = rcp(2ρ)
# ρ2inv = 0.5f0 * pinv
P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z)
fluxρ_x = U
fluxU_x = ρinv * U * U + P
fluxV_x = ρinv * U * V
fluxW_x = ρinv * U * W
fluxE_x = ρinv * U * (E + P)
fluxρ_y = V
fluxU_y = ρinv * V * U
fluxV_y = ρinv * V * V + P
fluxW_y = ρinv * V * W
fluxE_y = ρinv * V * (E + P)
fluxρ_z = W
fluxU_z = ρinv * W * U
fluxV_z = ρinv * W * V
fluxW_z = ρinv * W * W + P
fluxE_z = ρinv * W * (E + P)
s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z)
s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z)
s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z)
s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z)
s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z)
s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z)
s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z)
s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z)
s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z)
s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z)
r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z)
r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z)
r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z)
r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z)
r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z)
# one shared access per 10 flops
for n = 1:Nq
Dkn = s_D[k, n]
r_rhsρ[n] += Dkn * r_Hρ
r_rhsU[n] += Dkn * r_HU
r_rhsV[n] += Dkn * r_HV
r_rhsW[n] += Dkn * r_HW
r_rhsE[n] += Dkn * r_HE
end
r_rhsW[k] -= MJ * ρ * gravity
sync_threads()
# loop of ξ-grid lines
@unroll for n = 1:Nq
Dni = s_D[n, i]
Dnj = s_D[n, j]
r_rhsρ[k] += Dni * s_F[n, j, _ρ]
r_rhsρ[k] += Dnj * s_G[i, n, _ρ]
r_rhsU[k] += Dni * s_F[n, j, _U]
r_rhsU[k] += Dnj * s_G[i, n, _U]
r_rhsV[k] += Dni * s_F[n, j, _V]
r_rhsV[k] += Dnj * s_G[i, n, _V]
r_rhsW[k] += Dni * s_F[n, j, _W]
r_rhsW[k] += Dnj * s_G[i, n, _W]
r_rhsE[k] += Dni * s_F[n, j, _E]
r_rhsE[k] += Dnj * s_G[i, n, _E]
end
end # k
@unroll for k in 1:Nq
MJI = vgeo[i, j, k, _MJI, e]
# Updates are a performance bottleneck
# primary source of stall_long_sb
rhs[i, j, k, _U, e] += MJI*r_rhsU[k]
rhs[i, j, k, _V, e] += MJI*r_rhsV[k]
rhs[i, j, k, _W, e] += MJI*r_rhsW[k]
rhs[i, j, k, _ρ, e] += MJI*r_rhsρ[k]
rhs[i, j, k, _E, e] += MJI*r_rhsE[k]
end
end

Are you saying there is not a recommended way of achieving this?

cc: @vchuravy @leios

If this is not recommended, I am confused why KernelAbstractions is using it.

Because we don't have a good way to express a mutable, on-stack array type in Julia right now. IIUC, ImmutableArrays were supposed to solve this, but that effort stalled.

A safer way is to use an immutable array type and Base.setindex, possibly using a convenience macro like Setfield.jl. That's what I did in https://github.com/JuliaGPU/GemmKernels.jl/blob/master/src/array.jl, and should probably be ported to StaticArrays.jl' SArray.jl.

In the mean time, using MArray isn't bad per se, you just have to be aware of its fragility, meaning that minor changes to Julia's codegen can result in LLVM becoming confused and your previously stack-allocated MArray now becoming heap allocated (and as a result GPU incompatible).

# If @inbounds is added to this loop the code compiles

I think the core issue here is that the boundserror for MArray captures the object. Causing it to escape.
In KA I am very cautious about this using @inbounds to avoid this issue, since as Tim said we are relying on an optimization here.

Because we don't have a good way to express a mutable, on-stack array type in Julia right now. IIUC, ImmutableArrays were supposed to solve this, but that effort stalled.

That is too bad. That is what I really want.

A safer way is to use an immutable array type and Base.setindex, possibly using a convenience macro like Setfield.jl.

Thanks for the references. They seem reasonable for the uses I have of MArrays inside CUDA kernels. Using them for KA @private seems like it would need an API change.

# If @inbounds is added to this loop the code compiles

I think the core issue here is that the boundserror for MArray captures the object. Causing it to escape. In KA I am very cautious about this using @inbounds to avoid this issue, since as Tim said we are relying on an optimization here.

I think this is the issue as well. Two questions:

  • Am I using @private memory incorrectly in my kernel? The issue presents itself when I am running the tests where bounds checking is turned on everwhere.
  • Is there a better way to write the MArray bounds check [1] so that a reference to the array is not captured?

The issue presents itself when I am running the tests where bounds checking is turned on everwhere.

Yeah globally turning on bounds checking with break @private :/

Yeah as long as checkbounds doesn't require v. Instead one could just pass the axes

This pr #2314 fixes my current issue. I understand that is it probably not the most desirable solution but wonder if it is something worth adding because of KA's current reliance on MArray for @private.

I'm not opposed to such a quirk if it's required, but I'm surprised it's necessary in the first place as the CPU compilation pipeline does properly optimize everything away already.

With --checkbounds=true we will eventually see a call to throw_boundserror(A, I). Which is @noinline and creates a BoundError that captures A.

Last time I checked this would also happen on the CPU pipeline and is a side-effect of our BoundsError design.

With --checkbounds=true we will eventually see a call to throw_boundserror(A, I).

Sure, for that case the quirk is probably still useful. However, without that the MWE from this thread does only reproduce using GPUCompiler.jl, so might be worth looking into.

I haven't found the difference yet, but I suspect it's in the optimization pipeline.

Pre-optimizations:

1.10

   %newstruct = call noalias nonnull dereferenceable(12) {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task1, i64 12, {} addrspace(10)* addrspacecast ({}* inttoptr (i64 139432096781136 to {}*) to {} addrspace(10)*)) #0, !dbg !75
   %7 = addrspacecast {} addrspace(10)* %newstruct to {} addrspace(11)*, !dbg !75
...

L36:                                              ; preds = %L30
    %30 = addrspacecast [1 x i64]* %newstruct8 to [1 x i64] addrspace(11)*, !dbg !185
    call fastcc void @julia__throw_boundserror_1165({} addrspace(10)* %newstruct, [1 x i64] addrspace(11)* nocapture readonly %30) #7, !dbg !185
    call void @llvm.trap(), !dbg !185
    call void asm sideeffect "exit;", ""(), !dbg !185
    unreachable, !dbg !185

1.9

   %10 = call noalias nonnull {} addrspace(10)* @julia.gc_alloc_obj({}** %current_task1, i64 12, {} addrspace(10)* addrspacecast ({}* inttoptr (i64 135179767675600 to {}*) to {} addrspace(10)*)) #2, !dbg !77
   %11 = addrspacecast {} addrspace(10)* %10 to {} addrspace(11)*, !dbg !77
...

L36:                                              ; preds = %L30
; ││ @ abstractarray.jl:709 within `checkbounds`
    %44 = addrspacecast [1 x i64]* %5 to [1 x i64] addrspace(11)*, !dbg !188
    call fastcc void @julia__throw_boundserror_2843({} addrspace(10)* %10, [1 x i64] addrspace(11)* nocapture readonly %44) #8, !dbg !188
    call void @llvm.trap(), !dbg !188
    call void asm sideeffect "exit;", ""(), !dbg !188
    unreachable, !dbg !188

Post-opt

1.10

;  @ REPL[4]:2 within `demo`
; ┌ @ /home/vchuravy/.julia/packages/StaticArraysCore/P6PH7/src/StaticArraysCore.jl:197 within `MArray`
   %2 = call fastcc {}* @gpu_gc_pool_alloc({ i64, i32 } %state, i64 12), !dbg !140
; └
...
L36:                                              ; preds = %L20.postloop
  %9 = getelementptr inbounds [1 x i64], [1 x i64]* %newstruct8, i64 0, i64 0
;  @ REPL[4]:6 within `demo`
; ┌ @ /home/vchuravy/.julia/packages/StaticArrays/EHHaF/src/MArray.jl:31 within `setindex!`
; │┌ @ abstractarray.jl:700 within `checkbounds`
    store i64 %value_phi.postloop, i64* %9, align 8, !dbg !189, !tbaa !155, !alias.scope !159, !noalias !162
    %10 = call {}** @julia.get_gc_frame_slot({}** nonnull %gcframe, i32 0)
    store {}* %2, {}** %10, align 8
; ││ @ abstractarray.jl:702 within `checkbounds`
    call fastcc void @julia__throw_boundserror_1584({ i64, i32 } %state), !dbg !191
    call void @llvm.trap(), !dbg !191
    call void asm sideeffect "exit;", ""(), !dbg !191
    unreachable, !dbg !191

1.9

;  @ REPL[4]:2 within `demo`
; ┌ @ /home/vchuravy/.julia/packages/StaticArraysCore/P6PH7/src/StaticArraysCore.jl:197 within `MArray`
   %4 = call fastcc {}* @gpu_gc_pool_alloc({ i64, i32 } %state, i64 12), !dbg !138
   %5 = icmp sgt i64 %.fca.2.0.extract, 0
   %6 = select i1 %5, i64 %.fca.2.0.extract, i64 0
   %7 = bitcast i8 addrspace(1)* %.fca.0.extract to float addrspace(1)*
; └
....
L36:                                              ; preds = %L20.postloop
  %13 = getelementptr inbounds [1 x i64], [1 x i64]* %3, i64 0, i64 0
;  @ REPL[4]:6 within `demo`
; ┌ @ /home/vchuravy/.julia/packages/StaticArrays/EHHaF/src/MArray.jl:31 within `setindex!`
; │┌ @ abstractarray.jl:707 within `checkbounds`
    store i64 %value_phi.postloop, i64* %13, align 8, !dbg !186, !tbaa !152, !alias.scope !156, !noalias !159
; ││ @ abstractarray.jl:709 within `checkbounds`
    call fastcc void @julia__throw_boundserror_2854({ i64, i32 } %state), !dbg !188
    call void @llvm.trap(), !dbg !188
    call void asm sideeffect "exit;", ""(), !dbg !188
    unreachable, !dbg !188

In both cases dead-arg elimination (which runs very late) removed the actual argument to the CUDA.jl specific throw_boundserror quirk,

In both cases what remains is a gpu_gc_pool_allocate, but on 1.9 we didn't insert the corresponding GC push_frame ops.

Okay JuliaGPU/GPUCompiler.jl@cf6bcab fixes this case. Still not ideal since we have a gpu_alloc, but it does address the regression.

Note the PR is against current GPUCompiler, but locally I had backported it to 0.25