cuBLASXt's `xt_gemm!` incompatible with stream-ordered allocated memory
lpawela opened this issue · comments
If your bug is still valid, please go ahead and fill out the template below.
Describe the bug
After launching the second part of the MWE, I get the following error:
ERROR: LoadError: CUBLASError: an access to GPU memory space failed (code 11, CUBLAS_STATUS_MAPPING_ERROR)
Stacktrace:
[1] throw_api_error(res::CUDA.CUBLAS.cublasStatus_t)
@ CUDA.CUBLAS ~/.julia/packages/CUDA/htRwP/lib/cublas/libcublas.jl:11
[2] check
@ ~/.julia/packages/CUDA/htRwP/lib/cublas/libcublas.jl:21 [inlined]
[3] cublasXtSgemm
@ ~/.julia/packages/CUDA/htRwP/lib/utils/call.jl:26 [inlined]
[4] xt_gemm!(transA::Char, transB::Char, alpha::Int64, A::Matrix{Float32}, B::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, beta::Int64, C::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
@ CUDA.CUBLAS ~/.julia/packages/CUDA/htRwP/lib/cublas/wrappers.jl:1932
[5] top-level scope
@ ~/mwe_xt_gemm/mwe.jl:16
in expression starting at /home/lpawela/mwe_xt_gemm/mwe.jl:16
To reproduce
The Minimal Working Example (MWE) for this bug:
using CUDA
using LinearAlgebra
a = rand(Float32, 1000, 1000)
b = CUDA.rand(1000, 100)
c = CUDA.zeros(1000, 100)
CUBLAS.xt_gemm!('N', 'N', 1, a, b, 0, c) # WORKS
@assert isapprox(norm(a * Array(b) - Array(c)), 0; atol=0.1) # WORKS
println("First test passed")
a = rand(Float32, 100_000, 100_000)
b = CUDA.rand(100_000, 10_000)
c = CUDA.zeros(100_000, 10_000)
CUBLAS.xt_gemm!('N', 'N', 1, a, b, 0, c) # FAILS
Manifest.toml
# This file is machine-generated - editing it directly is not advised
julia_version = "1.10.2"
manifest_format = "2.0"
project_hash = "4d6f52122ea9741175c8c71d00021be0a921f3ad"
[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"
[deps.AbstractFFTs.extensions]
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
AbstractFFTsTestExt = "Test"
[deps.AbstractFFTs.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "4.0.4"
weakdeps = ["StaticArrays"]
[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"
[[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.5.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[deps.CEnum]]
git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.5.0"
[[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics"]
git-tree-sha1 = "baa8ea7a1ea63316fa3feb454635215773c9c845"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "5.2.0"
[deps.CUDA.extensions]
ChainRulesCoreExt = "ChainRulesCore"
SpecialFunctionsExt = "SpecialFunctions"
[deps.CUDA.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
[[deps.CUDA_Driver_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "d01bfc999768f0a31ed36f5d22a76161fc63079c"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.7.0+1"
[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "38f830504358e9972d2a0c3e5d51cb865e0733df"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.4"
[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "8e25c009d2bf16c2c31a70a6e9e8939f7325cc84"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.11.1+0"
[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "b10d0b65641d57b8b4d5e234446582de5047050d"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.5"
[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"
[[deps.Compat]]
deps = ["TOML", "UUIDs"]
git-tree-sha1 = "c955881e3c981181362ae4088b35995446298b80"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.14.0"
weakdeps = ["Dates", "LinearAlgebra"]
[deps.Compat.extensions]
CompatLinearAlgebraExt = "LinearAlgebra"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.1.0+0"
[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "4.1.1"
[[deps.DataAPI]]
git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.16.0"
[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "1.6.1"
[[deps.DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0f4b5d62a88d8f59003e43c25a8a90de9eb76317"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.18"
[[deps.DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
version = "1.0.0"
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"
[[deps.ExprTools]]
git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.10"
[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"
[[deps.Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "68e8ff56a4a355a85d2784b94614491f8c900cde"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "10.1.0"
[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.6"
[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "a846f297ce9d09ccba02ead0cae70690e072a119"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.25.0"
[[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
version = "1.4.0"
[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[deps.InvertedIndices]]
git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.3.0"
[[deps.IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"
[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.5.0"
[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.2.1+0"
[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "ed7167240f40e62d97c1f5f7735dea6de3cc5c49"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.18"
[deps.KernelAbstractions.extensions]
EnzymeExt = "EnzymeCore"
[deps.KernelAbstractions.weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
git-tree-sha1 = "839c82932db86740ae729779e610f07a1640be9a"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "6.6.3"
weakdeps = ["BFloat16s"]
[deps.LLVM.extensions]
BFloat16sExt = "BFloat16s"
[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "88b916503aac4fb7f701bb625cd84ca5dd1677bc"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.29+0"
[[deps.LLVMLoopInfo]]
git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea"
uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586"
version = "1.0.0"
[[deps.LaTeXStrings]]
git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec"
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
version = "1.3.1"
[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.4"
[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "8.4.0+0"
[[deps.LibGit2]]
deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[deps.LibGit2_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
version = "1.6.4+0"
[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.11.0+1"
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.13"
[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+1"
[[deps.Missings]]
deps = ["DataAPI"]
git-tree-sha1 = "ec4f7fbeab05d7747bdf98eb74d130a2a2ed298d"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "1.2.0"
[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2023.1.10"
[[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.4"
[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"
[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.23+4"
[[deps.OrderedCollections]]
git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.6.3"
[[deps.Parsers]]
deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.8.1"
[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.10.0"
[[deps.PooledArrays]]
deps = ["DataAPI", "Future"]
git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "1.4.3"
[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.2.1"
[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.4.3"
[[deps.PrettyTables]]
deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660"
uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
version = "2.3.1"
[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[deps.Random]]
deps = ["SHA"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[deps.Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.7.0"
[[deps.RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"
[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"
[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.1"
[[deps.SentinelArrays]]
deps = ["Dates", "Random"]
git-tree-sha1 = "0e7508ff27ba32f26cd459474ca2ede1bc10991f"
uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
version = "1.4.1"
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[deps.SortingAlgorithms]]
deps = ["DataStructures"]
git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "1.2.1"
[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
version = "1.10.0"
[[deps.StaticArrays]]
deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
git-tree-sha1 = "bf074c045d3d5ffd956fa0a461da38a44685d6b2"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.9.3"
[deps.StaticArrays.extensions]
StaticArraysChainRulesCoreExt = "ChainRulesCore"
StaticArraysStatisticsExt = "Statistics"
[deps.StaticArrays.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"
[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.10.0"
[[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
version = "0.3.4"
[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "7.2.1+1"
[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "1.0.1"
[[deps.Tables]]
deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "1.11.1"
[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"
[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.23"
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"
[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.3"
[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+1"
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+1"
[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.52.0+1"
[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+2"
Expected behavior
The second multiplication also passes.
Version info
Details on Julia:
Julia Version 1.10.2
Commit bd47eca2c8a (2024-03-01 10:14 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 128 × Intel(R) Xeon(R) Platinum 8462Y+
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, sapphirerapids)
Threads: 128 default, 0 interactive, 64 GC (on 128 virtual cores)
Details on CUDA:
CUDA runtime 12.3, artifact installation
CUDA driver 12.4
NVIDIA driver 550.54.15
CUDA libraries:
- CUBLAS: 12.3.4
- CURAND: 10.3.4
- CUFFT: 11.0.12
- CUSOLVER: 11.5.4
- CUSPARSE: 12.2.0
- CUPTI: 21.0.0
- NVML: 12.0.0+550.54.15
Julia packages:
- CUDA: 5.2.0
- CUDA_Driver_jll: 0.7.0+1
- CUDA_Runtime_jll: 0.11.1+0
Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7
4 devices:
0: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
1: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
2: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
3: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
Additional context
Similar and larger examples (up to 500k) work in C++ on the same setup.
Works for me.
> Similar and larger examples (up to 500k) work in C++ on the same setup.
What version of the CUDA toolkit is that using? Since you're using a CUDA 12.4 driver, I assume that might be the CUDA 12.4 toolkit. If so, try CUDA.jl#master to also use CUDA 12.4 for the Julia test.
On master, I still get the same error:
CUDA runtime 12.4, artifact installation
CUDA driver 12.4
NVIDIA driver 550.54.15
CUDA libraries:
- CUBLAS: 12.4.2
- CURAND: 10.3.5
- CUFFT: 11.2.0
- CUSOLVER: 11.6.0
- CUSPARSE: 12.3.0
- CUPTI: 22.0.0
- NVML: 12.0.0+550.54.15
Julia packages:
- CUDA: 5.3.0
- CUDA_Driver_jll: 0.8.0+0
- CUDA_Runtime_jll: 0.12.0+1
Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7
4 devices:
0: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
1: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
2: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
3: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
I also tried on a different machine (both 5.2 and master, versioninfo below). The error still happens.
5.2
CUDA runtime 12.3, artifact installation
CUDA driver 12.3
NVIDIA driver 545.23.6
CUDA libraries:
- CUBLAS: 12.3.4
- CURAND: 10.3.4
- CUFFT: 11.0.12
- CUSOLVER: 11.5.4
- CUSPARSE: 12.2.0
- CUPTI: 21.0.0
- NVML: 12.0.0+545.23.6
Julia packages:
- CUDA: 5.2.0
- CUDA_Driver_jll: 0.7.0+1
- CUDA_Runtime_jll: 0.11.1+0
Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7
2 devices:
0: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)
1: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)
master
CUDA runtime 12.4, artifact installation
CUDA driver 12.3
NVIDIA driver 545.23.6
CUDA libraries:
- CUBLAS: 12.4.2
- CURAND: 10.3.5
- CUFFT: 11.2.0
- CUSOLVER: 11.6.0
- CUSPARSE: 12.3.0
- CUPTI: 22.0.0
- NVML: 12.0.0+545.23.6
Julia packages:
- CUDA: 5.3.0
- CUDA_Driver_jll: 0.8.0+0
- CUDA_Runtime_jll: 0.12.0+1
Toolchain:
- Julia: 1.10.2
- LLVM: 15.0.7
2 devices:
0: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)
1: NVIDIA RTX A6000 (sm_86, 44.548 GiB / 44.988 GiB available)
That's surprising, because I also have an RTX A6000. I can reproduce on an H100 though.
Can you share your C++ reproducer?
Also, can you try running with `JULIA_DEBUG=CUBLAS`?
On the H100 machine on master
09:32:08 |base|lpawela@nirvana mwe_xt_gemm → JULIA_DEBUG=CUBLAS julia --project -t 128 mwe.jl
First test passed
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasXtCreate(cublasXtContext**) called:
│ handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f9324ac5dc0)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
ERROR: ┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasXtDeviceSelect(cublasXtHandle_t, int, int*) called:
│ handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x618e950)
│ nbDevices: type=int; val=4
│ deviceId: type=int; val=POINTER (IN HEX:0x0x7f9321955260)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
LoadError: ┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbcd0)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
CUBLASError: ┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbcd8)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=1; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
an access to GPU memory space failed┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbce0)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=2; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
(code 11, CUBLAS_STATUS_MAPPING_ERROR)┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasCreate_v2(cublasContext**) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x61cbce8)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=3; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
Stacktrace:┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasXtSgemm(cublasXtHandle_t, cublasOperation_t, cublasOperation_t, size_t, size_t, size_t, const float*, const float*, size_t, const float*, size_t, const float*, float*, size_t) called:
│ handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x618e950)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=SOME TYPE; val=1000
│ n: type=SOME TYPE; val=100
│ k: type=SOME TYPE; val=1000
│ alpha: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f8)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8c6c999040)
│ lda: type=SOME TYPE; val=1000
│ B: type=float; val=POINTER (IN HEX:0x0x620000000)
│ ldb: type=SOME TYPE; val=1000
│ beta: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f0)
│ C: type=float; val=POINTER (IN HEX:0x0x620061c00)
│ ldc: type=SOME TYPE; val=1000
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c0050f0)
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140230656906816; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x(nil)) (defaultStream); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1000
│ n: type=int; val=100
│ k: type=int; val=1000
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fe7dca38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f89d1200000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x620000000)
│ ldb: type=int; val=1000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fe7dca34)
│ C: type=float; val=POINTER (IN HEX:0x0x620061c00)
│ ldc: type=int; val=1000
│ Time: 2024-04-10T21:32:37 elapsed from start 0.100000 minutes or 6.000000 seconds
│ Process=414562; Thread=140230656906816; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c0050f0); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
│
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[1]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasXtSgemm(cublasXtHandle_t, cublasOperation_t, cublasOperation_t, size_t, size_t, size_t, const float*, const float*, size_t, const float*, size_t, const float*, float*, size_t) called:
│ handle: type=SOME TYPE; val=POINTER (IN HEX:0x0x618e950)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=SOME TYPE; val=100000
│ n: type=SOME TYPE; val=10000
│ k: type=SOME TYPE; val=100000
│ alpha: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f8)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8030f62040)
│ lda: type=SOME TYPE; val=100000
│ B: type=float; val=POINTER (IN HEX:0x0x6200c3800)
│ ldb: type=SOME TYPE; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7ffd86ce35f0)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=SOME TYPE; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140270086828608; GPU=0; Handle=POINTER (IN HEX:0x(nil))
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c0050f0); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
throw_api_error(res::CUDA.CUBLAS.cublasStatus_t)┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c3800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a34)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
@┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c4800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
CUDA.CUBLAS┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c5800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/cublas/┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c6800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
libcublas.jl:14┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c7800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c8800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[2]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200c9800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
check┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200ca800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
@┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200cb800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/cublas/┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
libcublas.jl:27┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200cc800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[inlined]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200cd800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[3]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200ce800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
cublasXtSgemm┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200cf800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
@┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d0800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/utils/┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d1800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
call.jl:30┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[inlined]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d2800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d3800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[4]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d4800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
xt_gemm!(transA::Char, transB::Char, alpha::Int64, A::Matrix{Float32}, B::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, beta::Int64, C::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d5800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
@┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d6800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
CUDA.CUBLAS┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d7800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/.julia/packages/CUDA/fGE8R/lib/cublas/┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
wrappers.jl:2145┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d8800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200d9800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
[5]┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200da800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
top-level scope┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200db800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
@┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200dc800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
~/mwe_xt_gemm/┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200dd800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
mwe.jl:16┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899c3acd00)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f8993a00000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200de800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
in expression starting at /home/lpawela/mwe_xt_gemm/mwe.jl:16┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSetStream_v2(cublasHandle_t, cudaStream_t) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ streamId: type=SOME TYPE; val=POINTER (IN HEX:0x0x7f899f21b480)
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899c3acd00); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
┌ Debug: cuBLAS (v12.3) function cublasStatus_t cublasSgemm_v2(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float*, const float*, int, const float*, int, const float*, float*, int) called:
│ handle: type=cublasHandle_t; val=POINTER (IN HEX:0x0x6250fb0)
│ transa: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ transb: type=cublasOperation_t; val=CUBLAS_OP_N(0)
│ m: type=int; val=1024
│ n: type=int; val=1024
│ k: type=int; val=1024
│ alpha: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a38)
│ A: type=float; val=POINTER (IN HEX:0x0x7f898a400000)
│ lda: type=int; val=1024
│ B: type=float; val=POINTER (IN HEX:0x0x6200df800)
│ ldb: type=int; val=100000
│ beta: type=float; val=POINTER (IN HEX:0x0x7f89fcfd9a3c)
│ C: type=float; val=POINTER (IN HEX:0x0x70e776000)
│ ldc: type=int; val=100000
│ Time: 2024-04-10T21:32:50 elapsed from start 0.316667 minutes or 19.000000 seconds
│ Process=414562; Thread=140230631728704; GPU=0; Handle=POINTER (IN HEX:0x0x6250fb0); StreamId=POINTER (IN HEX:0x0x7f899f21b480); MathMode=CUBLAS_DEFAULT_MATH
│ COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
└ @ CUDA.CUBLAS ~/.julia/packages/CUDA/fGE8R/lib/cublas/CUBLAS.jl:224
As for the C++ version: it was part of a somewhat larger program, but this is the main part:
// Allocates `bytes` of host memory with malloc and terminates the program with
// a diagnostic when the allocation fails. `what` names the buffer for the message.
static void *hostAllocOrDie(size_t bytes, const char *what) {
    void *ptr = malloc(bytes);
    if (ptr == nullptr && bytes != 0) {
        fprintf(stderr, "malloc of %zu bytes for %s failed\n", bytes, what);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Times one cublasXtgemm call (C = 1 * A * B + 0 * C, no transposes) and prints
// one result row.
//
// Dimensions as passed to cublasXtgemm: A is n x m (lda = n), B is m x k
// (ldb = m), C is n x k (ldc = n), all with element type T.
//
// Memory placement:
//   - A is ALWAYS a host (malloc) buffer in this version; the code that created
//     it on the device is commented out. `ah` is therefore not consulted.
//   - B and C are created on the device; when `bh` / `ch` is true the buffer is
//     copied to a fresh host buffer and the device copy is released, so the
//     GEMM receives a host pointer for that operand.
//
// `handle` is only referenced by the disabled plain-cuBLAS comparison below.
// CUDA_CALL, CURAND_CALL, CUBLAS_CALL, curandGenerateNormalAny, cublasXtgemm and
// theTypename are defined elsewhere in the original program.
template<class T=float>
void test(
        cublasHandle_t &handle, cublasXtHandle_t &xtHandle,
        curandGenerator_t &prng,
        size_t n, size_t m, size_t k,
        bool ah = false, bool bh = false, bool ch = false) {
    // Only used by the commented-out sections; silence unused warnings.
    (void)handle;
    (void)ah;

    T *A;
    T *B;
    T *C;
    static T zero = 0.0;
    static T one = 1.0;
    static T mone = -1.0;
    static T is2 = pow(T(0.5), T(0.5));
    (void)mone;
    (void)is2;
    /*
    CUDA_CALL(cudaMalloc((void **)(&A), n * m * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());
    if (n == m) {
    T *HA;
    CUDA_CALL(cudaMalloc((void **)(&HA), n * n * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());
    CURAND_CALL(curandGenerateNormalAny(prng, HA, n * m, zero, is2));
    CUDA_CALL(cudaDeviceSynchronize());
    CUBLAS_CALL(cublasgeam<T>(
    handle, CUBLAS_OP_N, CUBLAS_OP_T,
    n, n, &one, HA, n, &mone, HA, n, A, n));
    CUDA_CALL(cudaDeviceSynchronize());
    CUDA_CALL(cudaFree(HA)); HA = nullptr;
    } else {
    CURAND_CALL(curandGenerateNormalAny(prng, A, n * m, zero, one));
    }
    CUDA_CALL(cudaDeviceSynchronize());
    */

    // B: device buffer filled through curandGenerateNormalAny.
    CUDA_CALL(cudaMalloc((void **)(&B), m * k * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());
    CURAND_CALL(curandGenerateNormalAny(prng, B, m * k, zero, one));
    CUDA_CALL(cudaDeviceSynchronize());

    // C: device buffer; its contents are not initialised (beta is zero).
    CUDA_CALL(cudaMalloc((void **)(&C), n * k * sizeof(T)));
    CUDA_CALL(cudaDeviceSynchronize());
    cudaEvent_t start, stop;
    /*if (ah) {
    T *oM, *hM = (T *)(malloc(n * m * sizeof(T)));
    CUDA_CALL(cudaMemcpy(
    hM, A, n * m * sizeof(T), cudaMemcpyDeviceToHost));
    oM = A; A = hM;
    CUDA_CALL(cudaFree(oM)); oM = nullptr;
    }*/

    // A lives on the host unconditionally. Its contents are left uninitialised;
    // NOTE(review): presumably acceptable because only the run time is reported.
    A = (T *)(hostAllocOrDie(n * m * sizeof(T), "A"));

    if (bh) {
        // Move B to the host: copy, swap the pointer, release the device copy.
        T *hM = (T *)(hostAllocOrDie(m * k * sizeof(T), "B"));
        CUDA_CALL(cudaMemcpy(
            hM, B, m * k * sizeof(T), cudaMemcpyDeviceToHost));
        T *oM = B; B = hM;
        CUDA_CALL(cudaFree(oM)); oM = nullptr;
    }
    if (ch) {
        // Same for C.
        T *hM = (T *)(hostAllocOrDie(n * k * sizeof(T), "C"));
        CUDA_CALL(cudaMemcpy(
            hM, C, n * k * sizeof(T), cudaMemcpyDeviceToHost));
        T *oM = C; C = hM;
        CUDA_CALL(cudaFree(oM)); oM = nullptr;
    }

    // Timed section: start event, GEMM, device-wide sync, stop event.
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));
    CUDA_CALL(cudaEventRecord(start));
    CUDA_CALL(cudaEventSynchronize(start));
    CUBLAS_CALL(cublasXtgemm(
        xtHandle, CUBLAS_OP_N, CUBLAS_OP_N,
        n, k, m, &one, A, n, B, m, &zero, C, n));
    CUDA_CALL(cudaDeviceSynchronize());
    CUDA_CALL(cudaEventRecord(stop));
    CUDA_CALL(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CALL(cudaEventElapsedTime(&milliseconds, start, stop));
    printf(
        "%-8s %-6s %7zu %7zu %7zu %-6s %-6s %-6s %10.3f\n",
        "cublasXt",
        theTypename<T>(),
        n, m, k,
        "HOST",  // A is always host-resident in this version (see above)
        bh ? "HOST" : "DEVICE",
        ch ? "HOST" : "DEVICE",
        milliseconds);
    CUDA_CALL(cudaEventDestroy(stop));
    CUDA_CALL(cudaEventDestroy(start));
    CUDA_CALL(cudaDeviceSynchronize());
    /*
    T *dA = A, *dB = B, *dC = C;
    if (ah) CUDA_CALL(cudaMalloc((void **)(&dA), n * m * sizeof(T)));
    if (bh) CUDA_CALL(cudaMalloc((void **)(&dB), m * k * sizeof(T)));
    if (ch) CUDA_CALL(cudaMalloc((void **)(&dC), n * k * sizeof(T)));
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));
    CUDA_CALL(cudaEventRecord(start));
    CUDA_CALL(cudaEventSynchronize(start));
    if (ah)
    CUDA_CALL(cudaMemcpy(
    dA, A, n * m * sizeof(T), cudaMemcpyHostToDevice));
    if (bh)
    CUDA_CALL(cudaMemcpy(
    dB, B, m * k * sizeof(T), cudaMemcpyHostToDevice));
    if (ch)
    CUDA_CALL(cudaMemcpy(
    dC, C, n * k * sizeof(T), cudaMemcpyHostToDevice));
    CUBLAS_CALL(cublasgemm(
    handle, CUBLAS_OP_N, CUBLAS_OP_N,
    n, k, m, &one, dA, n, dB, m, &zero, dC, n));
    CUDA_CALL(cudaDeviceSynchronize());
    CUDA_CALL(cudaEventRecord(stop));
    CUDA_CALL(cudaEventSynchronize(stop));
    if (ch) CUDA_CALL(cudaFree(dC));
    dC = nullptr;
    if (bh) CUDA_CALL(cudaFree(dB));
    dB = nullptr;
    if (ah) CUDA_CALL(cudaFree(dA));
    dA = nullptr;
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf(
    "%-8s %-6s %7lu %7lu %7lu %-6s %-6s %-6s %10.3f\n",
    "CUBLAS",
    theTypename<T>(),
    n, m, k,
    ah ? "HOST" : "DEVICE",
    bh ? "HOST" : "DEVICE",
    ch ? "HOST" : "DEVICE",
    milliseconds);
    CUDA_CALL(cudaEventDestroy(stop));
    CUDA_CALL(cudaEventDestroy(start));
    CUDA_CALL(cudaDeviceSynchronize());
    */

    // Release each buffer with the deallocator matching how it is currently
    // held. Plain if/else (with braces) works whatever CUDA_CALL expands to.
    if (ch) {
        free(C);
    } else {
        CUDA_CALL(cudaFree(C));
    }
    C = nullptr;
    if (bh) {
        free(B);
    } else {
        CUDA_CALL(cudaFree(B));
    }
    B = nullptr;
    // A came from malloc regardless of `ah`, so it must never go to cudaFree.
    free(A);
    A = nullptr;
    CUDA_CALL(cudaDeviceSynchronize());
}
// Benchmark driver: creates the cuBLAS / cuBLASXt handles, lets cuBLASXt use
// every visible device, seeds a cuRAND generator, prints the table header and
// runs test() once for float and once for double.
int main() {
    // Library handles.
    cublasHandle_t handle;
    CUBLAS_CALL(cublasCreate(&handle));
    cublasXtHandle_t xtHandle;
    CUBLAS_CALL(cublasXtCreate(&xtHandle));

    // Hand devices 0 .. device_count-1 to cuBLASXt.
    int device_count = 1;
    CUDA_CALL(cudaGetDeviceCount(&device_count));
    printf("device_count = %d\n", device_count);
    int *ids = (int *)(malloc(device_count * sizeof(int)));
    int next = 0;
    while (next < device_count) {
        ids[next] = next;
        ++next;
    }
    CUBLAS_CALL(cublasXtDeviceSelect(xtHandle, device_count, ids));
    free(ids);
    ids = nullptr;

    // Pseudo-random generator with a fixed seed.
    curandGenerator_t prng;
    CURAND_CALL(curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_XORWOW));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(prng, 0xDEADBEEF));

    int p = 35000;
    printf(
        "%-8s %-6s %7s %7s %7s %-6s %-6s %-6s %10s\n",
        "library", "type", "n", "m", "k",
        "A_mem", "B_mem", "C_mem", "time [ms]");

    // The loop over all eight placement combinations (idx = 0..7) is disabled;
    // only combination 1 is run. Bits 0/1/2 of idx select host placement of
    // A/B/C respectively.
    size_t idx = 1;
    const bool ah = (idx & 1) != 0;
    const bool bh = (idx & 2) != 0;
    const bool ch = (idx & 4) != 0;
    const int dim = 10 * p;
    test(handle, xtHandle, prng, dim, dim, 1000, ah, bh, ch);
    test<double>(handle, xtHandle, prng, dim, dim, 1000, ah, bh, ch);

    // Tear everything down in reverse order of creation.
    CURAND_CALL(curandDestroyGenerator(prng));
    CUBLAS_CALL(cublasXtDestroy(xtHandle));
    CUBLAS_CALL(cublasDestroy(handle));
    CUDA_CALL(cudaDeviceSynchronize());
    return 0;
}
C++ MWE that does reproduce the error:
#include <iostream>
#include <vector>
#include <cuda.h>
#include <cublasXt.h>
// Error checking for CUDA Driver API.
// The macro body is wrapped in do { } while (0) so that `CUDA_CHECK(x);` is a
// single statement and stays valid inside an unbraced if/else.
#define CUDA_CHECK(call) do { gpuAssert((call), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA driver call on std::cerr and, when `abort` is true,
// terminates the process with the error code as exit status.
//   code  - result of the driver call
//   file  - source file of the call site (from __FILE__)
//   line  - source line of the call site (from __LINE__)
//   abort - exit the process on failure (default: true)
inline void gpuAssert(CUresult code, const char *file, int line, bool abort=true) {
    if (code == CUDA_SUCCESS) {
        return;
    }
    // cuGetErrorString can itself fail for an unrecognised code, in which case
    // the out-pointer must not be streamed: inserting a null const char* into
    // an ostream is undefined behaviour.
    const char *error_string = nullptr;
    if (cuGetErrorString(code, &error_string) != CUDA_SUCCESS || error_string == nullptr) {
        error_string = "unknown error";
    }
    std::cerr << "CUDA Driver API error: " << error_string << " at " << file << ":" << line << std::endl;
    if (abort) exit(code);
}
// Error checking for CUBLAS API.
// The macro body is wrapped in do { } while (0) so that `CUBLAS_CHECK(x);` is a
// single statement and stays valid inside an unbraced if/else.
#define CUBLAS_CHECK(status) do { cublasAssert((status), __FILE__, __LINE__); } while (0)

// Reports a failed cuBLAS call on std::cerr (the status is printed as its
// numeric value) and, when `abort` is true, terminates the process with that
// status as exit code.
//   status - result of the cuBLAS call
//   file   - source file of the call site (from __FILE__)
//   line   - source line of the call site (from __LINE__)
//   abort  - exit the process on failure (default: true)
inline void cublasAssert(cublasStatus_t status, const char *file, int line, bool abort=true) {
    if (status == CUBLAS_STATUS_SUCCESS) {
        return;
    }
    std::cerr << "CUBLAS API error: " << status << " at " << file << ":" << line << std::endl;
    if (abort) exit(status);
}
int main() {
// Initialize CUDA
CUDA_CHECK(cuInit(0));
// Set up primary contexts for both devices
std::vector<int> device_ids = {0, 1};
std::vector<CUcontext> contexts(device_ids.size());
for (auto id : device_ids) {
CUdevice cuDevice;
CUDA_CHECK(cuDeviceGet(&cuDevice, id));
CUDA_CHECK(cuDevicePrimaryCtxRetain(&contexts[id], cuDevice));
}
// Activate the first device's context
CUDA_CHECK(cuCtxSetCurrent(contexts[0]));
// Create a stream for operations
CUstream stream;
CUDA_CHECK(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
// Allocate memory from the pool
CUdeviceptr d_A, d_B, d_C;
size_t m = 100000, n = 10000, k = 100000;
size_t bytes_A = m * k * sizeof(float);
size_t bytes_B = k * n * sizeof(float);
size_t bytes_C = m * n * sizeof(float);
CUDA_CHECK(cuMemAllocAsync(&d_A, bytes_A, stream));
CUDA_CHECK(cuMemAllocAsync(&d_B, bytes_B, stream));
CUDA_CHECK(cuMemAllocAsync(&d_C, bytes_C, stream));
// Set up CUBLAS Xt
cublasXtHandle_t xtHandle;
CUBLAS_CHECK(cublasXtCreate(&xtHandle));
// Configure CUBLAS Xt to use both devices
CUBLAS_CHECK(cublasXtDeviceSelect(xtHandle, device_ids.size(), device_ids.data()));
// Perform the matrix multiplication
float alpha = 1.0f, beta = 0.0f;
CUBLAS_CHECK(cublasXtSgemm(xtHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, (float*)d_A, m, (float*)d_B, k, &beta, (float*)d_C, m));
return 0;
}
It seems to be related to `cuMemAllocAsync`.
Filed a bug with NVIDIA. Workaround: run with `JULIA_CUDA_MEMORY_POOL=none` set in your environment.
I can confirm the workaround makes the error go away.