JuliaGPU / CUDA.jl

CUDA programming in Julia.

Home Page: https://juliagpu.org/cuda/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

cuBLASXt's `xt_gemm!` incompatible with stream-ordered allocated memory

lpawela opened this issue · comments

If your bug is still valid, please go ahead and fill out the template below.

Describe the bug

After running the second part of the MWE, I get the following error:

ERROR: LoadError: CUBLASError: an access to GPU memory space failed (code 11, CUBLAS_STATUS_MAPPING_ERROR)
Stacktrace:
 [1] throw_api_error(res::CUDA.CUBLAS.cublasStatus_t)
   @ CUDA.CUBLAS ~/.julia/packages/CUDA/htRwP/lib/cublas/libcublas.jl:11
 [2] check
   @ ~/.julia/packages/CUDA/htRwP/lib/cublas/libcublas.jl:21 [inlined]
 [3] cublasXtSgemm
   @ ~/.julia/packages/CUDA/htRwP/lib/utils/call.jl:26 [inlined]
 [4] xt_gemm!(transA::Char, transB::Char, alpha::Int64, A::Matrix{Float32}, B::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, beta::Int64, C::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
   @ CUDA.CUBLAS ~/.julia/packages/CUDA/htRwP/lib/cublas/wrappers.jl:1932
 [5] top-level scope
   @ ~/mwe_xt_gemm/mwe.jl:16
in expression starting at /home/lpawela/mwe_xt_gemm/mwe.jl:16

To reproduce

The Minimal Working Example (MWE) for this bug:

using CUDA
using LinearAlgebra

a = rand(Float32, 1000, 1000)
b = CUDA.rand(1000, 100)
c = CUDA.zeros(1000, 100)

CUBLAS.xt_gemm!('N', 'N', 1, a, b, 0, c) # WORKS
@assert isapprox(norm(a * Array(b) - Array(c)), 0; atol=0.1) # WORKS
println("First test passed")

a = rand(Float32, 100_000, 100_000)
b = CUDA.rand(100_000, 10_000)
c = CUDA.zeros(100_000, 10_000)

CUBLAS.xt_gemm!('N', 'N', 1, a, b, 0, c) # FAILS
Manifest.toml

# This file is machine-generated - editing it directly is not advised

julia_version = "1.10.2"
manifest_format = "2.0"
project_hash = "4d6f52122ea9741175c8c71d00021be0a921f3ad"

[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"

    [deps.AbstractFFTs.extensions]
    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
    AbstractFFTsTestExt = "Test"

    [deps.AbstractFFTs.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "4.0.4"
weakdeps = ["StaticArrays"]

    [deps.Adapt.extensions]
    AdaptStaticArraysExt = "StaticArrays"

[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"

[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"

[[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.5.0"

[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

[[deps.CEnum]]
git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.5.0"

[[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics"]
git-tree-sha1 = "baa8ea7a1ea63316fa3feb454635215773c9c845"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "5.2.0"

    [deps.CUDA.extensions]
    ChainRulesCoreExt = "ChainRulesCore"
    SpecialFunctionsExt = "SpecialFunctions"

    [deps.CUDA.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[[deps.CUDA_Driver_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "d01bfc999768f0a31ed36f5d22a76161fc63079c"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.7.0+1"

[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "38f830504358e9972d2a0c3e5d51cb865e0733df"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.4"

[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "8e25c009d2bf16c2c31a70a6e9e8939f7325cc84"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.11.1+0"

[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "b10d0b65641d57b8b4d5e234446582de5047050d"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.5"

[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"

[[deps.Compat]]
deps = ["TOML", "UUIDs"]
git-tree-sha1 = "c955881e3c981181362ae4088b35995446298b80"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.14.0"
weakdeps = ["Dates", "LinearAlgebra"]

    [deps.Compat.extensions]
    CompatLinearAlgebraExt = "LinearAlgebra"

[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.1.0+0"

[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"

[[deps.DataAPI]]
git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.16.0"

[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"

[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0f4b5d62a88d8f59003e43c25a8a90de9eb76317"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.18"

[[deps.DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"

[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"

[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.10"

[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"

[[deps.Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"

[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "68e8ff56a4a355a85d2784b94614491f8c900cde"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "10.1.0"

[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.6"

[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "a846f297ce9d09ccba02ead0cae70690e072a119"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.25.0"

[[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
version = "1.4.0"

[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[deps.InvertedIndices]]
git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.3.0"

[[deps.IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"

[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.5.0"

[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.2.1+0"

[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "ed7167240f40e62d97c1f5f7735dea6de3cc5c49"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.18"

    [deps.KernelAbstractions.extensions]
    EnzymeExt = "EnzymeCore"

    [deps.KernelAbstractions.weakdeps]
    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"

[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
git-tree-sha1 = "839c82932db86740ae729779e610f07a1640be9a"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "6.6.3"
weakdeps = ["BFloat16s"]

    [deps.LLVM.extensions]
    BFloat16sExt = "BFloat16s"

[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "88b916503aac4fb7f701bb625cd84ca5dd1677bc"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.29+0"

[[deps.LLVMLoopInfo]]
git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea"
uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586"
version = "1.0.0"

[[deps.LaTeXStrings]]
git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec"
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
version = "1.3.1"

[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"

[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.4"

[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "8.4.0+0"

[[deps.LibGit2]]
deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

[[deps.LibGit2_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
version = "1.6.4+0"

[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.11.0+1"

[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.13"

[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+1"

[[deps.Missings]]
deps = ["DataAPI"]
git-tree-sha1 = "ec4f7fbeab05d7747bdf98eb74d130a2a2ed298d"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.2.0"

[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2023.1.10"

[[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.4"

[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"

[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"

[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.23+4"

[[deps.OrderedCollections]]
git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.6.3"

[[deps.Parsers]]
deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.8.1"

[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.10.0"

[[deps.PooledArrays]]
deps = ["DataAPI", "Future"]
git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "1.4.3"

[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.1"

[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.4.3"

[[deps.PrettyTables]]
deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660"
uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
version = "2.3.1"

[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"

[[deps.Random]]
deps = ["SHA"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[deps.Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.7.0"

[[deps.RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"

[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.1"

[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
git-tree-sha1 = "0e7508ff27ba32f26cd459474ca2ede1bc10991f"
uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
version = "1.4.1"

[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

[[deps.SortingAlgorithms]]
deps = ["DataStructures"]
git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.2.1"

[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
version = "1.10.0"

[[deps.StaticArrays]]
deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
git-tree-sha1 = "bf074c045d3d5ffd956fa0a461da38a44685d6b2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.9.3"

    [deps.StaticArrays.extensions]
    StaticArraysChainRulesCoreExt = "ChainRulesCore"
    StaticArraysStatisticsExt = "Statistics"

    [deps.StaticArrays.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"

[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.10.0"

[[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
version = "0.3.4"

[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "7.2.1+1"

[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"

[[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"

[[deps.Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.11.1"

[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"

[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.23"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"

[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"

[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+1"

[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+1"

[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.52.0+1"

[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+2"

Expected behavior

The second multiplication should also succeed.

Version info

Details on Julia:

Julia Version 1.10.2
Commit bd47eca2c8a (2024-03-01 10:14 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 128 × Intel(R) Xeon(R) Platinum 8462Y+
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, sapphirerapids)
Threads: 128 default, 0 interactive, 64 GC (on 128 virtual cores)

Details on CUDA:

CUDA runtime 12.3, artifact installation
CUDA driver 12.4
NVIDIA driver 550.54.15

CUDA libraries: 
- CUBLAS: 12.3.4
- CURAND: 10.3.4
- CUFFT: 11.0.12
- CUSOLVER: 11.5.4
- CUSPARSE: 12.2.0
- CUPTI: 21.0.0
- NVML: 12.0.0+550.54.15

Julia packages: 
- CUDA: 5.2.0
- CUDA_Driver_jll: 0.7.0+1
- CUDA_Runtime_jll: 0.11.1+0

Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7

4 devices:
  0: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  1: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  2: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  3: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)

Additional context

Similar and larger examples (up to 500k) work in C++ on the same setup.

Works for me.

> Similar and larger examples (up to 500k) work in C++ on the same setup.

What version of the CUDA toolkit is that using? Since you're using a CUDA 12.4 driver, I assume that might be the CUDA 12.4 toolkit. If so, try CUDA.jl#master to also use CUDA 12.4 for the Julia test.

On master I still get the same error.

CUDA runtime 12.4, artifact installation
CUDA driver 12.4
NVIDIA driver 550.54.15

CUDA libraries: 
- CUBLAS: 12.4.2
- CURAND: 10.3.5
- CUFFT: 11.2.0
- CUSOLVER: 11.6.0
- CUSPARSE: 12.3.0
- CUPTI: 22.0.0
- NVML: 12.0.0+550.54.15

Julia packages: 
- CUDA: 5.3.0
- CUDA_Driver_jll: 0.8.0+0
- CUDA_Runtime_jll: 0.12.0+1

Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7

4 devices:
  0: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  1: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  2: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  3: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)

I also tried on a different machine (both 5.2 and master, versioninfo below). The error still happens.

5.2

CUDA runtime 12.3, artifact installation
CUDA driver 12.3
NVIDIA driver 545.23.6

CUDA libraries: 
- CUBLAS: 12.3.4
- CURAND: 10.3.4
- CUFFT: 11.0.12
- CUSOLVER: 11.5.4
- CUSPARSE: 12.2.0
- CUPTI: 21.0.0
- NVML: 12.0.0+545.23.6

Julia packages: 
- CUDA: 5.2.0
- CUDA_Driver_jll: 0.7.0+1
- CUDA_Runtime_jll: 0.11.1+0

Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7

2 devices:
  0: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)
  1: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)

master

CUDA runtime 12.4, artifact installation
CUDA driver 12.3
NVIDIA driver 545.23.6

CUDA libraries: 
- CUBLAS: 12.4.2
- CURAND: 10.3.5
- CUFFT: 11.2.0
- CUSOLVER: 11.6.0
- CUSPARSE: 12.3.0
- CUPTI: 22.0.0
- NVML: 12.0.0+545.23.6

Julia packages: 
- CUDA: 5.3.0
- CUDA_Driver_jll: 0.8.0+0
- CUDA_Runtime_jll: 0.12.0+1

Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7

2 devices:
  0: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)
  1: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)

That's surprising, because I also have an RTX A6000. I can reproduce on an H100 though.

Can you share your C++ reproducer?

Also, can you try running with JULIA_DEBUG=CUBLAS?

On the H100 machine, on master:

09:32:08  |base|lpawela@nirvana mwe_xt_gemm  JULIA_DEBUG=CUBLAS julia --project -t 128 mwe.jl 
First test passed
┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasXtCreate(cublasXtContext**) called:
│   handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f9324ac5dc0)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
ERROR: ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasXtDeviceSelect(cublasXtHandle_t, int, int*) called:
│   handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x618e950)
│   nbDevices: type=int; val=4
│   deviceId: type=int; val=POINTER (IN HEX:0x0x7f9321955260)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
LoadError: ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbcd0)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
CUBLASError: ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbcd8)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=1; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
an access to GPU memory space failed┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbce0)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=2; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 (code 11, CUBLAS_STATUS_MAPPING_ERROR)┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbce8)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=3; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

Stacktrace:┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasXtSgemm(cublasXtHandle_t, cublasOperation_t, cublasOperation_t, size_t, size_t, size_t, const float*, const float*, size_t, const float*, size_t, const float*, float*, size_t) called:
│   handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x618e950)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=SOME TYPE; val=1000
│   n: type=SOME TYPE; val=100
│   k: type=SOME TYPE; val=1000
│   alpha: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f8)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8c6c999040)
│   lda: type=SOME TYPE; val=1000
│   B: type=float; val=POINTER (IN HEX:0x0x620000000)
│   ldb: type=SOME TYPE; val=1000
│   beta: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f0)
│   C: type=float; val=POINTER (IN HEX:0x0x620061c00)
│   ldc: type=SOME TYPE; val=1000
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c0050f0)
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140230656906816; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x(nil)) (defaultStream); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1000
│   n: type=int; val=100
│   k: type=int; val=1000
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fe7dca38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f89d1200000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x620000000)
│   ldb: type=int; val=1000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fe7dca34)
│   C: type=float; val=POINTER (IN HEX:0x0x620061c00)
│   ldc: type=int; val=1000
│  Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140230656906816; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c0050f0); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
│ 
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[1]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasXtSgemm(cublasXtHandle_t, cublasOperation_t, cublasOperation_t, size_t, size_t, size_t, const float*, const float*, size_t, const float*, size_t, const float*, float*, size_t) called:
│   handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x618e950)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=SOME TYPE; val=100000
│   n: type=SOME TYPE; val=10000
│   k: type=SOME TYPE; val=100000
│   alpha: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f8)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8030f62040)
│   lda: type=SOME TYPE; val=100000
│   B: type=float; val=POINTER (IN HEX:0x0x6200c3800)
│   ldb: type=SOME TYPE; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f0)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=SOME TYPE; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c0050f0); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
throw_api_error(res::CUDA.CUBLAS.cublasStatus_t)┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c3800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a34)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
   @┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c4800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
CUDA.CUBLAS┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c5800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/cublas/┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c6800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
libcublas.jl:14┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c7800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c8800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[2]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200c9800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
check┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200ca800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
   @┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200cb800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/cublas/┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
libcublas.jl:27┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200cc800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 [inlined]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200cd800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[3]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200ce800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
cublasXtSgemm┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200cf800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
   @┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d0800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/utils/┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d1800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
call.jl:30┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 [inlined]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d2800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d3800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[4]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d4800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
xt_gemm!(transA::Char, transB::Char, alpha::Int64, A::Matrix{Float32}, B::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, beta::Int64, C::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d5800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
   @┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d6800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
CUDA.CUBLAS┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d7800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/cublas/┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
wrappers.jl:2145┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d8800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200d9800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[5]┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200da800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
top-level scope┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200db800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
   @┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200dc800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
 ┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/mwe_xt_gemm/┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200dd800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
mwe.jl:16┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200de800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

in expression starting at /home/lpawela/mwe_xt_gemm/mwe.jl:16┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

┌ Debug:  cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│   handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│   transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│   m: type=int; val=1024
│   n: type=int; val=1024
│   k: type=int; val=1024
│   alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│   A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│   lda: type=int; val=1024
│   B: type=float; val=POINTER (IN HEX:0x0x6200df800)
│   ldb: type=int; val=100000
│   beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│   C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│   ldc: type=int; val=100000
│  Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│  COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224

As for the C++ version, the full program was somewhat larger, but this is the main part:

template<class T=float>
void test(
        cublasHandle_t &handle, cublasXtHandle_t &xtHandle,
        curandGenerator_t &prng,
        size_t n, size_t m, size_t k,
        bool ah = false, bool bh = false, bool ch = false) {
    T *A;
    T *B;
    T *C;

    static T zero = 0.0;
    static T one = 1.0;
    static T mone = -1.0;
    static T is2 = pow(T(0.5), T(0.5));

    /*
    CUDA_CALL(cudaMalloc((void **)(&A), n * m * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());

    if (n == m) {
        T *HA;
        CUDA_CALL(cudaMalloc((void **)(&HA), n * n * sizeof(T)));
        CUDA_CALL(cudaDeviceSynchronize());
        CURAND_CALL(curandGenerateNormalAny(prng, HA, n * m, zero, is2));
        CUDA_CALL(cudaDeviceSynchronize());
        CUBLAS_CALL(cublasgeam<T>(
            handle, CUBLAS_OP_N, CUBLAS_OP_T,
            n, n, &one, HA, n, &mone, HA, n, A, n));
        CUDA_CALL(cudaDeviceSynchronize());
        CUDA_CALL(cudaFree(HA)); HA = nullptr;
    } else {
        CURAND_CALL(curandGenerateNormalAny(prng, A, n * m, zero, one));
    }
    CUDA_CALL(cudaDeviceSynchronize());
    */

    CUDA_CALL(cudaMalloc((void **)(&B), m * k * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());

    CURAND_CALL(curandGenerateNormalAny(prng, B, m * k, zero, one));
    CUDA_CALL(cudaDeviceSynchronize());

    CUDA_CALL(cudaMalloc((void **)(&C), n * k * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());

    cudaEvent_t start, stop;

    /*if (ah) {
        T *oM, *hM = (T *)(malloc(n * m * sizeof(T)));
        CUDA_CALL(cudaMemcpy(
            hM, A, n * m * sizeof(T), cudaMemcpyDeviceToHost));
        oM = A; A = hM;
        CUDA_CALL(cudaFree(oM)); oM = nullptr;
    }*/
    A = (T *)(malloc(n * m * sizeof(T)));

    if (bh) {
        T *oM, *hM = (T *)(malloc(m * k * sizeof(T)));
        CUDA_CALL(cudaMemcpy(
            hM, B, m * k * sizeof(T), cudaMemcpyDeviceToHost));
        oM = B; B = hM;
        CUDA_CALL(cudaFree(oM)); oM = nullptr;
    }

    if (ch) {
        T *oM, *hM = (T *)(malloc(n * k * sizeof(T)));
        CUDA_CALL(cudaMemcpy(
            hM, C, n * k * sizeof(T), cudaMemcpyDeviceToHost));
        oM = C; C = hM;
        CUDA_CALL(cudaFree(oM)); oM = nullptr;
    }

    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));

    CUDA_CALL(cudaEventRecord(start));
    CUDA_CALL(cudaEventSynchronize(start));
    CUBLAS_CALL(cublasXtgemm(
        xtHandle, CUBLAS_OP_N, CUBLAS_OP_N,
        n, k, m, &one, A, n, B, m, &zero, C, n));
    CUDA_CALL(cudaDeviceSynchronize());
    CUDA_CALL(cudaEventRecord(stop));
    CUDA_CALL(cudaEventSynchronize(stop));

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf(
        "%-8s %-6s %7lu %7lu %7lu %-6s %-6s %-6s %10.3f\n",
        "cublasXt",
        theTypename<T>(),
        n, m, k,
        ah ? "HOST" : "DEVICE",
        bh ? "HOST" : "DEVICE",
        ch ? "HOST" : "DEVICE",
        milliseconds);

    CUDA_CALL(cudaEventDestroy(stop));
    CUDA_CALL(cudaEventDestroy(start));
    CUDA_CALL(cudaDeviceSynchronize());

    /*
    T *dA = A, *dB = B, *dC = C;
    if (ah) CUDA_CALL(cudaMalloc((void **)(&dA), n * m * sizeof(T)));
    if (bh) CUDA_CALL(cudaMalloc((void **)(&dB), m * k * sizeof(T)));
    if (ch) CUDA_CALL(cudaMalloc((void **)(&dC), n * k * sizeof(T)));
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));

    CUDA_CALL(cudaEventRecord(start));
    CUDA_CALL(cudaEventSynchronize(start));

    if (ah)
        CUDA_CALL(cudaMemcpy(
            dA, A, n * m * sizeof(T), cudaMemcpyHostToDevice));

    if (bh)
        CUDA_CALL(cudaMemcpy(
            dB, B, m * k * sizeof(T), cudaMemcpyHostToDevice));

    if (ch)
        CUDA_CALL(cudaMemcpy(
            dC, C, n * k * sizeof(T), cudaMemcpyHostToDevice));

    CUBLAS_CALL(cublasgemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_N,
        n, k, m, &one, dA, n, dB, m, &zero, dC, n));
    CUDA_CALL(cudaDeviceSynchronize());
    CUDA_CALL(cudaEventRecord(stop));
    CUDA_CALL(cudaEventSynchronize(stop));

    if (ch) CUDA_CALL(cudaFree(dC));
    dC = nullptr;

    if (bh) CUDA_CALL(cudaFree(dB));
    dB = nullptr;

    if (ah) CUDA_CALL(cudaFree(dA));
    dA = nullptr;

    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf(
        "%-8s %-6s %7lu %7lu %7lu %-6s %-6s %-6s %10.3f\n",
        "CUBLAS",
        theTypename<T>(),
        n, m, k,
        ah ? "HOST" : "DEVICE",
        bh ? "HOST" : "DEVICE",
        ch ? "HOST" : "DEVICE",
        milliseconds);

    CUDA_CALL(cudaEventDestroy(stop));
    CUDA_CALL(cudaEventDestroy(start));
    CUDA_CALL(cudaDeviceSynchronize());
    */

    ch ? free(C) : CUDA_CALL(cudaFree(C)); C = nullptr;
    bh ? free(B) : CUDA_CALL(cudaFree(B)); B = nullptr;
    ah ? free(A) : CUDA_CALL(cudaFree(A)); A = nullptr;
    CUDA_CALL(cudaDeviceSynchronize());
}


int main() {
    cublasHandle_t handle;
    CUBLAS_CALL(cublasCreate(&handle));

    cublasXtHandle_t xtHandle;
    CUBLAS_CALL(cublasXtCreate(&xtHandle));

    int device_count = 1;
    CUDA_CALL(cudaGetDeviceCount(&device_count));
    printf("device_count = %d\n", device_count);
    int *device_ids = (int *)(malloc(device_count * sizeof(int)));
    for (int idx = 0; idx < device_count; ++idx) {
        device_ids[idx] = idx;
    }
    CUBLAS_CALL(cublasXtDeviceSelect(xtHandle, device_count, device_ids));
    free(device_ids); device_ids = nullptr;

    curandGenerator_t prng;
    CURAND_CALL(curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_XORWOW));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(prng, 0xDEADBEEF));

    int p = 35000;
    printf(
        "%-8s %-6s %7s %7s %7s %-6s %-6s %-6s %10s\n",
        "library", "type", "n", "m", "k",
        "A_mem", "B_mem", "C_mem", "time [ms]");
    //for (size_t idx = 0; idx < 8; ++idx) {
    size_t idx = 1;
        bool ah = idx & 1;
        bool bh = idx & 2;
        bool ch = idx & 4;
        test(handle, xtHandle, prng, 10 * p, 10 * p, 1000, ah, bh, ch);
        test<double>(handle, xtHandle, prng, 10 * p, 10 * p, 1000, ah, bh, ch);
    //}

    CURAND_CALL(curandDestroyGenerator(prng));
    CUBLAS_CALL(cublasXtDestroy(xtHandle));
    CUBLAS_CALL(cublasDestroy(handle));
    CUDA_CALL(cudaDeviceSynchronize());

    return 0;
}

C++ MWE that does reproduce the error:

#include <iostream>
#include <vector>
#include <cuda.h>
#include <cublasXt.h>

// Error checking for CUDA Driver API.
// CUDA_CHECK(call) evaluates `call` once and passes its CUresult, together
// with the call site, to gpuAssert. The do/while(0) wrapper makes the macro a
// single statement, so `if (x) CUDA_CHECK(...); else ...` parses as intended.
#define CUDA_CHECK(call) do { gpuAssert((call), __FILE__, __LINE__); } while (0)

// Reports a failed driver call on stderr and, when `abort` is true, exits the
// process using `code` as the exit status. Does nothing for CUDA_SUCCESS.
inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
    if (code != CUDA_SUCCESS) {
        // cuGetErrorString can itself fail for a code it does not recognise;
        // never stream an unset or null C string.
        const char *error_string = nullptr;
        const bool have_text =
            cuGetErrorString(code, &error_string) == CUDA_SUCCESS && error_string != nullptr;
        std::cerr << "CUDA Driver API error: ";
        if (have_text) {
            std::cerr << error_string;
        } else {
            std::cerr << "unrecognized error code " << static_cast<int>(code);
        }
        std::cerr << " at " << file << ":" << line << std::endl;
        if (abort) exit(code);
    }
}

// Error checking for CUBLAS API.
// CUBLAS_CHECK(status) evaluates its argument once and passes the resulting
// cublasStatus_t, together with the call site, to cublasAssert. The
// do/while(0) wrapper makes the macro a single statement.
#define CUBLAS_CHECK(status) do { cublasAssert((status), __FILE__, __LINE__); } while (0)

// Reports a failed cuBLAS call on stderr, printing the symbolic status name
// followed by its numeric value, and exits with that value when `abort` is
// true. Does nothing for CUBLAS_STATUS_SUCCESS.
inline void cublasAssert(cublasStatus_t status, const char *file, int line, bool abort=true) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        // A bare number such as 11 is hard to act on; also print the enum
        // name (e.g. CUBLAS_STATUS_MAPPING_ERROR).
        std::cerr << "CUBLAS API error: " << cublasGetStatusName(status)
                  << " (" << status << ") at " << file << ":" << line << std::endl;
        if (abort) exit(status);
    }
}

int main() {
    // Initialize CUDA
    CUDA_CHECK(cuInit(0));

    // Set up primary contexts for both devices
    std::vector<int> device_ids = {0, 1};
    std::vector<CUcontext> contexts(device_ids.size());
    for (auto id : device_ids) {
        CUdevice cuDevice;
        CUDA_CHECK(cuDeviceGet(&cuDevice, id));
        CUDA_CHECK(cuDevicePrimaryCtxRetain(&contexts[id], cuDevice));
    }

    // Activate the first device's context
    CUDA_CHECK(cuCtxSetCurrent(contexts[0]));

    // Create a stream for operations
    CUstream stream;
    CUDA_CHECK(cuStreamCreate(&stream, CU_STREAM_DEFAULT));

    // Allocate memory from the pool
    CUdeviceptr d_A, d_B, d_C;
    size_t m = 100000, n = 10000, k = 100000;
    size_t bytes_A = m * k * sizeof(float);
    size_t bytes_B = k * n * sizeof(float);
    size_t bytes_C = m * n * sizeof(float);
    CUDA_CHECK(cuMemAllocAsync(&d_A, bytes_A, stream));
    CUDA_CHECK(cuMemAllocAsync(&d_B, bytes_B, stream));
    CUDA_CHECK(cuMemAllocAsync(&d_C, bytes_C, stream));

    // Set up CUBLAS Xt
    cublasXtHandle_t xtHandle;
    CUBLAS_CHECK(cublasXtCreate(&xtHandle));

    // Configure CUBLAS Xt to use both devices
    CUBLAS_CHECK(cublasXtDeviceSelect(xtHandle, device_ids.size(), device_ids.data()));

    // Perform the matrix multiplication
    float alpha = 1.0f, beta = 0.0f;
    CUBLAS_CHECK(cublasXtSgemm(xtHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, (float*)d_A, m, (float*)d_B, k, &beta, (float*)d_C, m));

    return 0;
}

It seems to be related to the use of cuMemAllocAsync.

Filed a bug with NVIDIA. Workaround: run with JULIA_CUDA_MEMORY_POOL=none in your environment.

I can confirm the workaround makes the error go away.