JuliaGPU / CUDA.jl

CUDA programming in Julia.

Home Page:https://juliagpu.org/cuda/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

`@gcsafe_ccall` breaks inlining of ccall wrappers

maleadt opened this issue · comments

#2262 introduced a regression: Using @gcsafe_ccall instead of plain @ccall apparently makes it so that our ccall wrappers aren't fully inlined anymore, resulting in simple Ref boxes (which we need a ton of with the CUDA C APIs) start allocating.

MWE:

function ccall_macro_lower(func, rettype, types, args, nreq)
    # instead of re-using ccall or Expr(:foreigncall) to perform argument conversion,
    # we need to do so ourselves in order to insert a jl_gc_safe_enter|leave
    # just around the inner ccall

    cconvert_exprs = []
    cconvert_args = []
    for (typ, arg) in zip(types, args)
        var = gensym("$(func)_cconvert")
        push!(cconvert_args, var)
        push!(cconvert_exprs, :($var = Base.cconvert($(esc(typ)), $(esc(arg)))))
    end

    unsafe_convert_exprs = []
    unsafe_convert_args = []
    for (typ, arg) in zip(types, cconvert_args)
        var = gensym("$(func)_unsafe_convert")
        push!(unsafe_convert_args, var)
        push!(unsafe_convert_exprs, :($var = Base.unsafe_convert($(esc(typ)), $arg)))
    end

    call = quote
        $(unsafe_convert_exprs...)

        gc_state = @ccall(jl_gc_safe_enter()::Int8)
        ret = ccall($(esc(func)), $(esc(rettype)), $(Expr(:tuple, map(esc, types)...)),
                    $(unsafe_convert_args...))
        @ccall(jl_gc_safe_leave(gc_state::Int8)::Cvoid)
        ret
    end

   quote
        @inline
        $(cconvert_exprs...)
        GC.@preserve $(cconvert_args...) $(call)
    end
end

macro gcsafe_ccall(expr)
    ccall_macro_lower(Base.ccall_macro_parse(expr)...)
end

@inline function check(f)
    res = f()
    if res < 0
        throw(res)
    end

    return
end

function cuCtxGetCurrent(ptr)
    check() do
        @gcsafe_ccall time(C_NULL::Ptr{Cvoid}, ptr::Ptr{Int})::Cint
    end
end

function current_context()
    handle_ref = Ref{Int}()
    cuCtxGetCurrent(handle_ref)
    handle_ref[]
end

current_context()
@show @allocated current_context()

Ways to 'make' this inline again (and get 0 allocated bytes):

  • replace @gcsafe_ccall with plain @ccall
  • within the @gcsafe_ccall, remove the call to jl_gc_safe_leave
  • within current_context, call-site inline the call to cuCtxGetCurrent

I can understand how the added calls in the @gcsafe_ccall body push the function over the inlining limit, but it's very surprising to me that I can only force inlining by using a call-site @inline, and not by annotating the generated code or function with @inline. @aviatesk, is that expected behavior?

cc @vchuravy

Hm I wonder if the inline meta survives... I remember some code that scans he top of he function and if there is anything before that it might not see it?

Good suggestion, but doesn't seem to help. To make sure it's very much the first expression:

macro gcsafe_ccall(expr)
    ex = ccall_macro_lower(Base.ccall_macro_parse(expr)...)
    pushfirst!(ex.args, Expr(:meta, :inline))
    println(ex)
    return ex
end

... which gives

begin
    $(Expr(:meta, :inline))
    #= /tmp/wip.jl:35 =#
    var"##:time_cconvert#225" = Base.cconvert($(Expr(:escape, :(Ptr{Cvoid}))), $(Expr(:escape, :C_NULL)))
    var"##:time_cconvert#226" = Base.cconvert($(Expr(:escape, :(Ptr{Int}))), $(Expr(:escape, :ptr)))
    #= /tmp/wip.jl:36 =#
    #= /tmp/wip.jl:36 =# GC.@preserve var"##:time_cconvert#225" var"##:time_cconvert#226" begin
            #= /tmp/wip.jl:25 =#
            var"##:time_unsafe_convert#227" = Base.unsafe_convert($(Expr(:escape, :(Ptr{Cvoid}))), var"##:time_cconvert#225")
            var"##:time_unsafe_convert#228" = Base.unsafe_convert($(Expr(:escape, :(Ptr{Int}))), var"##:time_cconvert#226")
            #= /tmp/wip.jl:27 =#
            gc_state = #= /tmp/wip.jl:27 =# @ccall(jl_gc_safe_enter()::Int8)
            #= /tmp/wip.jl:28 =#
            ret = ccall($(Expr(:escape, :(:time))), $(Expr(:escape, :Cint)), ($(Expr(:escape, :(Ptr{Cvoid}))), $(Expr(:escape, :(Ptr{Int})))), var"##:time_unsafe_convert#227", var"##:time_unsafe_convert#228")
            #= /tmp/wip.jl:30 =#
            #= /tmp/wip.jl:30 =# @ccall jl_gc_safe_leave(gc_state::Int8)::Cvoid
            #= /tmp/wip.jl:31 =#
            ret
        end
end

... still allocates.

EDIT: wait, it did make me 'discover' that adding an @inline to the cuCtxGetCurrent function definition does seem to work? Let me verify if that also works in CUDA.jl proper.

e49197b... Not particularly happy with all the explicit inlining going on there, but yeah...

After experimenting a bit, it appears that this issue no longer occurs in versions 1.12 and above, probably thanks to some slight advancements in effect analysis. However, since these are quite delicate adjustments, I believe it's very beneficial to explicitly use the @inline annotation. Particularly, it's crucial to remove allocations for handle_ref, which makes inlining cuCtxGetCurrent necessary. Thus, adding an @inline to it is completely justified.

Thanks!