JuliaGPU / AMDGPUnative.jl

Julia interface to AMD/Radeon GPUs

Poor performance, possible kernel caching issue

nsailor opened this issue

Running the following Julia script:

using HSARuntime, AMDGPUnative

const N = 1024;
const B = 1024 * 256;

da = HSAArray(zeros(Float32, N * B))

function add1!(da)
    # Compute the 1-based global index (workitem/workgroup indices are 1-based in Julia)
    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    da[i] += 1f0  # Float32 literal avoids Float64 promotion on the device
    return
end

# Don't benchmark the kernel compilation
@roc gridsize=B groupsize=N add1!(da)

function run_iterations()
    for i = 1:128
        wait(@roc gridsize=B groupsize=N add1!(da))
    end
end

@time run_iterations()

gives

14.511331 seconds (23.31 k allocations: 2.355 MiB)
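For context, a trace like the one below can be gathered with Julia's standard Profile stdlib; a minimal reconstruction of the measurement (not necessarily the exact commands used here) would be:

using Profile

Profile.clear()
@profile run_iterations()   # the kernel was already compiled by the earlier @roc call
Profile.print()             # prints the call-tree view shown below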

Profiling with @profile suggests that a lot of time is spent recompiling the kernel and creating the HSA executable:

Overhead ╎ [+additional indent] Count File:Line; Function
=========================================================
     ╎14513 @Base/client.jl:506; _start()
     ╎ 14513 @Base/client.jl:313; exec_options(::Base.JLOptions)
     ╎  14513 @Base/client.jl:383; run_main_repl(::Bool, ::Bool, ::Bool, ::Bool, ::Bool)
     ╎   14513 @Base/essentials.jl:709; invokelatest
     ╎    14513 @Base/essentials.jl:710; #invokelatest#1
     ╎     14513 @Base/client.jl:399; (::Base.var"#802#804"{Bool,Bool,Bool,Bool})(::Mod...
     ╎    ╎ 14513 @REPL/src/REPL.jl:286; run_repl(::REPL.AbstractREPL, ::Any)
     ╎    ╎  14513 @REPL/src/REPL.jl:290; run_repl(::REPL.AbstractREPL, ::Any; backend_on_...
     ╎    ╎   14513 @REPL/src/REPL.jl:178; start_repl_backend(::REPL.REPLBackend, ::Any)
     ╎    ╎    14513 @REPL/src/REPL.jl:193; repl_backend_loop(::REPL.REPLBackend)
     ╎    ╎     14513 @REPL/src/REPL.jl:132; eval_user_input(::Any, ::REPL.REPLBackend)
    6╎    ╎    ╎ 14513 @Base/boot.jl:331; eval(::Module, ::Any)
     ╎    ╎    ╎  14507 /tmp/julia/test.jl:19; run_calculation()
     ╎    ╎    ╎   13046 @AMDGPUnative/src/execution.jl:187; macro expansion
     ╎    ╎    ╎    13046 @AMDGPUnative/src/execution.jl:349; (::Core.var"#Any##kw")(::NamedTuple{(:signal...
     ╎    ╎    ╎     13046 @AMDGPUnative/src/execution.jl:349; #_#123
     ╎    ╎    ╎    ╎ 13046 @AMDGPUnative/src/execution.jl:250; call##kw
     ╎    ╎    ╎    ╎  13046 @AMDGPUnative/src/execution.jl:250; #call#108
     ╎    ╎    ╎    ╎   13046 @AMDGPUnative/src/execution.jl:274; macro expansion
     ╎    ╎    ╎    ╎    13046 ...GPUnative/src/execution.jl:289; (::AMDGPUnative.var"#roccall##kw")(::Name...
     ╎    ╎    ╎    ╎     13046 ...GPUnative/src/execution.jl:294; roccall(::AMDGPUnative.HostKernel{add1!,...
     ╎    ╎    ╎    ╎    ╎ 13046 ...ive/src/execution_utils.jl:92; (::AMDGPUnative.var"#roccall##kw")(::Nam...
     ╎    ╎    ╎    ╎    ╎  13046 ...ve/src/execution_utils.jl:92; #roccall#88
     ╎    ╎    ╎    ╎    ╎   13046 ...ve/src/execution_utils.jl:97; (::AMDGPUnative.var"#_roccall##kw")(::N...
     ╎    ╎    ╎    ╎    ╎    13046 ...ve/src/execution_utils.jl:97; _roccall(::AMDGPUnative.RuntimeQueue{HS...
     ╎    ╎    ╎    ╎    ╎     13046 ...e/src/execution_utils.jl:119; macro expansion
     ╎    ╎    ╎    ╎    ╎    ╎ 13046 .../src/execution_utils.jl:148; launch
     ╎    ╎    ╎    ╎    ╎    ╎  13046 .../src/execution_utils.jl:153; _launch
     ╎    ╎    ╎    ╎    ╎    ╎   13045 .../src/execution_utils.jl:178; macro expansion
     ╎    ╎    ╎    ╎    ╎    ╎    13045 ...PUnative/src/runtime.jl:28; create_executable
     ╎    ╎    ╎    ╎    ╎    ╎     2763  ...Unative/src/runtime.jl:34; create_executable(::AMDGPUnative.De...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎ 2763  @Base/file.jl:657; mktemp
     ╎    ╎    ╎    ╎    ╎    ╎    ╎  2760  @Base/file.jl:659; mktemp(::AMDGPUnative.var"#2#3"{R...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎   2760  ...native/src/runtime.jl:38; (::AMDGPUnative.var"#2#3"{ROCFunc...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    2760  @Base/process.jl:438; run
     ╎    ╎    ╎    ╎    ╎    ╎    ╎     2760  @Base/process.jl:440; run(::Cmd; wait::Bool)
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎ 2760  @Base/process.jl:483; success
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎  2760  @Base/process.jl:622; wait(::Base.Process)
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎   2760  @Base/condition.jl:106; wait(::Base.GenericCondition{B...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎    2760  @Base/task.jl:712; wait
 2759╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎     2760  @Base/task.jl:704; poptask(::Base.InvasiveLinke...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎  1     @Base/file.jl:662; mktemp(::AMDGPUnative.var"#2#3"{R...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎   1     @Base/iostream.jl:44; close
    1╎    ╎    ╎    ╎    ╎    ╎    ╎    1     @Base/lock.jl:106; unlock(::ReentrantLock)
     ╎    ╎    ╎    ╎    ╎    ╎    ╎  2     @Base/file.jl:663; mktemp(::AMDGPUnative.var"#2#3"{R...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎   2     @Base/file.jl:260; rm
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    1     @Base/file.jl:260; rm(::String; force::Bool, recurs...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎     1     @Base/stat.jl:314; islink
    1╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎ 1     @Base/stat.jl:67; lstat(::String)
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    1     @Base/file.jl:268; rm(::String; force::Bool, recurs...
    1╎    ╎    ╎    ╎    ╎    ╎    ╎     1     @Base/file.jl:887; unlink(::String)
     ╎    ╎    ╎    ╎    ╎    ╎     2     ...Unative/src/runtime.jl:41; create_executable(::AMDGPUnative.De...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎ 2     @Base/io.jl:405; read(::String)
     ╎    ╎    ╎    ╎    ╎    ╎    ╎  2     @Base/io.jl:321; open
     ╎    ╎    ╎    ╎    ╎    ╎    ╎   2     @Base/io.jl:323; open(::Base.var"#291#292"{Tuple{}...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎    2     @Base/io.jl:405; #291
     ╎    ╎    ╎    ╎    ╎    ╎    ╎     2     @Base/iostream.jl:502; read(::IOStream)
    2╎    ╎    ╎    ╎    ╎    ╎    ╎    ╎ 2     @Base/iostream.jl:444; readbytes_all!(::IOStream, ::Ar...
     ╎    ╎    ╎    ╎    ╎    ╎     10280 ...Unative/src/runtime.jl:44; create_executable(::AMDGPUnative.De...
     ╎    ╎    ╎    ╎    ╎    ╎    ╎ 10280 ...ime/src/HSARuntime.jl:236; HSAExecutable(::HSAAgent, ::Array{...
10280╎    ╎    ╎    ╎    ╎    ╎    ╎  10280 ...src/HSA/libhsa_api.jl:576; executable_load_agent_code_object
     ╎    ╎    ╎    ╎    ╎    ╎   1     .../src/execution_utils.jl:179; macro expansion
     ╎    ╎    ╎    ╎    ╎    ╎    1     ...PUnative/src/runtime.jl:50; create_kernel
     ╎    ╎    ╎    ╎    ╎    ╎     1     ...Unative/src/runtime.jl:52; create_kernel
     ╎    ╎    ╎    ╎    ╎    ╎    ╎ 1     ...ime/src/HSARuntime.jl:299; HSAKernelInstance(::HSAAgent, ::HS...
    1╎    ╎    ╎    ╎    ╎    ╎    ╎  1     ...src/HSA/libhsa_api.jl:477; memory_allocate
     ╎    ╎    ╎   1461  @AMDGPUnative/src/runtime.jl:23; wait
     ╎    ╎    ╎    1461  @AMDGPUnative/src/runtime.jl:23; #wait#1
     ╎    ╎    ╎     1461  @HSARuntime/src/HSARuntime.jl:575; wait
     ╎    ╎    ╎    ╎ 1461  @HSARuntime/src/HSARuntime.jl:581; wait(::HSASignal; soft::Bool, minlat::Float64)
     ╎    ╎    ╎    ╎  1461  @Base/asyncevent.jl:213; sleep
     ╎    ╎    ╎    ╎   1461  @Base/asyncevent.jl:128; wait
     ╎    ╎    ╎    ╎    1461  @Base/asyncevent.jl:110; _trywait(::Timer)
     ╎    ╎    ╎    ╎     1461  @Base/condition.jl:106; wait(::Base.GenericCondition{Base.Thread...
     ╎    ╎    ╎    ╎    ╎ 1461  @Base/task.jl:712; wait
 1461╎    ╎    ╎    ╎    ╎  1461  @Base/task.jl:704; poptask(::Base.InvasiveLinkedListSynchr...
Total snapshots: 14514

For reference, the equivalent HIP program below runs in roughly 2.3 seconds on the same machine.

#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>

const int N = 1024;
const int B = 1024 * 256;
const int iter = 128;

typedef float T;

constexpr int bytesize = N * B * sizeof(T);

__global__ void add1(T *da, int N) {
    int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if (i < N) {
        da[i] += static_cast<T>(1);
    }
}

int main() {
    T *a = new T[N * B];
    for (int i = 0; i < (N * B); ++i) {
        a[i] = static_cast<T>(0);
    }

    T *da;
    hipMalloc(&da, bytesize);

    hipMemcpyHtoD(da, a, bytesize);

    for (int i = 0; i < iter; ++i) {
        hipLaunchKernelGGL(add1, B, N, 0, 0, da, N * B);
        hipDeviceSynchronize();
    }

    hipMemcpyDtoH(a, da, bytesize);

    bool passed = true;
    for (int i = 0; i < (N * B); ++i) {
        if (a[i] != static_cast<T>(iter)) {
            passed = false;
            break;
        }
    }

    if (!passed) {
        fprintf(stderr, "Test failed. Results incorrect!\n");
    }

    hipFree(da);
    delete[] a;
}

Yes, this is definitely an issue with how "not lazy enough" our code object/executable path is right now. We'll probably need to maintain a cache that maps a given method signature to a final executable. It would probably need to be a two-level cache, where the first level is an LRU cache of in-memory (HSA-loaded) executables, and the second level is backed by the paths to the linked binaries on disk (this level could also be LRU, just much larger than the first).

We'd also need a way to determine when to invalidate an entry in these caches, which would happen when a function definition changes. @maleadt do we have any infrastructure in GPUCompiler for communicating to the backends when cached_compilation has had to recompile a cached method?
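A minimal sketch of that idea, using hypothetical names (ExecutableCache, get_executable! and the link/load callbacks are illustrative, not actual AMDGPUnative API):

# Hypothetical two-level executable cache (illustrative sketch only).
# Level 1: in-memory, HSA-loaded executables; level 2: linked binaries on disk.
# LRU eviction is omitted for brevity.
struct ExecutableCache
    loaded::Dict{Any,Any}       # method signature => loaded HSA executable
    on_disk::Dict{Any,String}   # method signature => path to linked binary
end
ExecutableCache() = ExecutableCache(Dict{Any,Any}(), Dict{Any,String}())

function get_executable!(cache::ExecutableCache, sig, link::Function, load::Function)
    get!(cache.loaded, sig) do
        # Level-1 miss: reuse the on-disk binary if we have one, else link it.
        path = get!(cache.on_disk, sig) do
            link(sig)   # compile + ld.lld link, returning the binary's path
        end
        load(read(path))   # create the HSA executable from the binary's bytes
    end
end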

We'll probably need to maintain a cache that maps from a given method signature to a final executable.

That's what the compilation cache is: https://github.com/JuliaGPU/GPUCompiler.jl/blob/c46c146447cb0bd34e1cfd0662fd20f8dd7b6b13/src/cache.jl#L10-L17
There should be no need for additional caches.

Why would the back-end need to know when GPUCompiler had to recompile a method? The old one might still be running, or valid in another world, so it's not like you can unload the method anyway.

Caching on-disk is impossible, since we can't query whether code has changed across Julia sessions.

It's not that I want to invalidate entries in GPUCompiler's compilation cache, but that we have an extra layer of (very expensive) linking and ELF loading that needs to happen after GPUCompiler has returned the object file to us.

However, I see what you mean; in CUDAnative, for example, _cufunction does any extra compilation/linking work using tools external to GPUCompiler, which is where we need to insert our ld.lld linking and HSA code object loading. Thanks for the insight!
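Concretely, the expensive part to memoize is everything downstream of GPUCompiler: the ld.lld link and the HSA executable/kernel creation. A rough, hypothetical sketch (load_uncached stands in for the current create_executable path; names are illustrative only):

# Hypothetical memoization of the link/load step (not the actual fix).
# Keyed on the agent plus a hash of the object code GPUCompiler returns,
# so repeated launches of an unchanged kernel reuse one loaded executable.
const _loaded_executables = Dict{Any,Any}()

function load_cached(agent, obj::Vector{UInt8}, load_uncached::Function)
    get!(_loaded_executables, (agent, hash(obj))) do
        load_uncached(agent, obj)   # ld.lld link + HSAExecutable creation
    end
end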

julia> @time run_iterations()
  1.478988 seconds (10.76 k allocations: 331.109 KiB)

julia> Profile.print()
Overhead ╎ [+additional indent] Count File:Line; Function
=========================================================
    ╎1514 @Base/client.jl:506; _start()
    ╎ 1514 @Base/client.jl:313; exec_options(::Base.JLOptions)
    ╎  1514 @Base/client.jl:383; run_main_repl(::Bool, ::Bool, ::Bool, ::Bool, ::Bool)
    ╎   1514 @Base/essentials.jl:709; invokelatest
    ╎    1514 @Base/essentials.jl:710; #invokelatest#1
    ╎     1514 @Base/client.jl:399; (::Base.var"#802#804"{Bool,Bool,Bool,Bool})(::Module)
    ╎    ╎ 1514 @REPL/src/REPL.jl:288; run_repl(::REPL.AbstractREPL, ::Any)
    ╎    ╎  1514 @REPL/src/REPL.jl:292; run_repl(::REPL.AbstractREPL, ::Any; backend_on_current_task::Bool)
    ╎    ╎   1514 @REPL/src/REPL.jl:180; start_repl_backend(::REPL.REPLBackend, ::Any)
    ╎    ╎    1514 @REPL/src/REPL.jl:195; repl_backend_loop(::REPL.REPLBackend)
    ╎    ╎     1514 @REPL/src/REPL.jl:134; eval_user_input(::Any, ::REPL.REPLBackend)
   5╎    ╎    ╎ 1514 @Base/boot.jl:331; eval(::Module, ::Any)
    ╎    ╎    ╎  1509 REPL[7]:3; run_iterations()
    ╎    ╎    ╎   1509 @AMDGPUnative/src/runtime.jl:23; wait
    ╎    ╎    ╎    1509 @AMDGPUnative/src/runtime.jl:23; #wait#1
    ╎    ╎    ╎     1509 @HSARuntime/src/signal.jl:25; wait
    ╎    ╎    ╎    ╎ 1509 @HSARuntime/src/signal.jl:31; wait(::HSASignal; soft::Bool, minlat::Float64)
    ╎    ╎    ╎    ╎  1509 @Base/asyncevent.jl:213; sleep
    ╎    ╎    ╎    ╎   1509 @Base/asyncevent.jl:128; wait
    ╎    ╎    ╎    ╎    1509 @Base/asyncevent.jl:110; _trywait(::Timer)
    ╎    ╎    ╎    ╎     1509 @Base/condition.jl:106; wait(::Base.GenericCondition{Base.Threads.SpinLock})
    ╎    ╎    ╎    ╎    ╎ 1509 @Base/task.jl:712; wait
1508╎    ╎    ╎    ╎    ╎  1509 @Base/task.jl:704; poptask(::Base.InvasiveLinkedListSynchronized{Task})
Total snapshots: 1516

The remaining time is spent waiting on the kernels to complete.