USCqserver / OpenQuantumTools.jl

We've added a GPU solver

OpenQuantumTools.jl/src/QSolver/closed_system_solvers.jl

Lines 30 to 44 in f0e34ec

    
           """
               solve_schrodinger_gpu(A::Annealing, tf::Real; tspan = (0, tf), kwargs...)

           Assemble an in-place `ODEProblem` for the annealing `A` from `cu`-converted state
           and operator caches, then pass it to `solve`.

           # Arguments
           - `A`: annealing object; its `H`, `u0` and `annealing_parameter` fields are read.
           - `tf`: total evolution time, handed to `ODEParams` as `float(tf)`.

           # Keywords
           - `tspan`: integration interval, converted element-wise to `Float32`.
           - `kwargs...`: forwarded unchanged to `solve`.

           # Returns
           Whatever `solve` returns for the assembled problem.
           """
           function solve_schrodinger_gpu(A::Annealing, tf::Real; tspan = (0, tf), kwargs...)
               # Initial state, converted with `cu`.
               state0 = cu(build_u0(A.u0, :v))
               params = ODEParams(A.H, float(tf), A.annealing_parameter)

               # Callback that refreshes an operator cache in place; the state argument is
               # unused. `prm(t)` is the value passed on to `update_cache!`.
               refresh = (C, u, prm, t) -> update_cache!(C, prm.L, prm, prm(t))

               # The operator and its Jacobian prototype share the callback but each writes
               # into its own cache.
               op_cache = cu(get_cache(A.H))
               rhs_op = DiffEqArrayOperator(op_cache, update_func = refresh)
               jac_op = DiffEqArrayOperator(cu(similar(op_cache)), update_func = refresh)

               rhs = ODEFunction(rhs_op, jac_prototype = jac_op)
               problem = ODEProblem{true}(rhs, state0, Float32.(tspan), params)
               return solve(problem; alg_hints = [:nonstiff], kwargs...)
           end

which will get integrated better soon. We've shown informally that for n = 10 qubits and a tf = 10 ns anneal, the GPU version takes around 1 second and the CPU version 8 seconds. We'd like to get more systematic benchmarking/run data on these to show when you get an improvement and by how much.

The informal timing was done with the following test code:
https://github.com/naezzell/accelqat/blob/b617c423daaa4cb0ab2f4c1a4d8f2536fb9f7bb3/cuda/try_gpu_accel.jl#L1-L66
with 2 CPUs and 1 GPU on USC Discovery cluster.

For the GPU-accelerated solver we have done a preliminary benchmark for comparison. It shows an advantage when the init Hamiltonian is big enough.
The test code can be found here: naezzell/accelqat/cuda/scaling_test.jl.
The test results and relevant CPU information can be found here: naezzell/accelqat/cuda/scaling_test_result/ (with one NVIDIA Tesla K40 GPU)

	# Solve the annealing `A` over `tspan` (default `(0, tf)`) using `cu`-converted
	# state and operator caches. Extra keyword arguments are forwarded to `solve`,
	# and the value returned by `solve` is returned unchanged.
	function solve_schrodinger_gpu(A::Annealing, tf::Real; tspan = (0, tf), kwargs...)
	    init_state = cu(build_u0(A.u0, :v))
	    ode_params = ODEParams(A.H, float(tf), A.annealing_parameter)

	    # In-place cache refresh used by both operators below; `u` is not used.
	    function refresh_cache(C, u, q, t)
	        return update_cache!(C, q.L, q, q(t))
	    end

	    # Separate caches for the operator and for the Jacobian prototype.
	    h_cache = cu(get_cache(A.H))
	    h_op = DiffEqArrayOperator(h_cache, update_func = refresh_cache)
	    j_cache = cu(similar(h_cache))
	    j_op = DiffEqArrayOperator(j_cache, update_func = refresh_cache)

	    # `{true}` selects the in-place problem form; the time span is cast to Float32.
	    ode_fn = ODEFunction(h_op, jac_prototype = j_op)
	    ode_prob = ODEProblem{true}(ode_fn, init_state, Float32.(tspan), ode_params)
	    return solve(ode_prob; alg_hints = [:nonstiff], kwargs...)
	end

Benchmarking time with CPU/GPU solvers for comparison