CNugteren / CLBlast

Tuned OpenCL BLAS

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Python Memory Management

ZacharyVarley opened this issue · comments

I am probably missing something simple. After around 10 iterations of a loop that calls syrk, I get an out-of-memory error.

import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyclblast


def blast_covariance_matrix(data, datatype='float32'):
    """Run a CLBlast SYRK over ``data`` and return the result on the host.

    ``data`` is read as an ``(n_samples, n_features)`` array, converted to
    ``datatype``, uploaded to the device and passed to ``pyclblast.syrk``
    with ``a_transp=True``, ``alpha=1.0`` and ``beta=0.0``. The
    ``(n_features, n_features)`` output buffer is copied back and returned.

    NOTE: a brand-new OpenCL context is created on every call. The
    discussion this snippet comes from identifies creating many contexts in
    a loop as the cause of the reported out-of-memory error.
    """
    context = cl.create_some_context()
    # The queue is used as a context manager so it is released on exit.
    with cl.CommandQueue(context) as queue:
        n_samples, n_features = data.shape
        host_input = data.astype(datatype)

        # Device-side input buffer, filled from the converted host array.
        device_input = Array(queue, host_input.shape, host_input.dtype)
        device_input.set(host_input)

        # Device-side output buffer; only allocated here, not initialised.
        device_output = Array(queue, (n_features, n_features), dtype=datatype)

        # Only the upper triangle is requested (lower_triangle=False).
        pyclblast.syrk(
            queue, n_features, n_samples, device_input, device_output,
            n_features, n_features,
            alpha=1.0, beta=0.0, lower_triangle=False, a_transp=True)

        # Copy the result back, then wait on both arrays and the queue
        # (the original author's attempt to avoid leaking device memory).
        result = device_output.get()
        device_input.finish()
        device_output.finish()
        queue.finish()
    return result


# Reproduction driver: call the function 100 times, each time on a freshly
# generated 100000 x 3600 random matrix. The issue reports an out-of-memory
# error after roughly 10 iterations of this loop.
for i in range(100):
    data = np.random.rand(100000, 3600)
    cov = blast_covariance_matrix(data)
    print("finished iteration {}".format(i))

I found that the culprit was creating many contexts in a loop. Using a single ctx removes the issue:

import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyclblast

# create a single context
ctx = cl.create_some_context()


def blast_covariance_matrix(data, datatype='float32'):
    """Compute ``data.T @ data`` on the OpenCL device via CLBlast SYRK.

    Despite the name, no mean-centering and no ``1 / (n_samples - 1)``
    scaling is applied: a single SYRK call with ``alpha=1.0`` and
    ``beta=0.0`` is issued, so the result is the uncentred, unscaled
    product. Centre ``data`` and scale the result yourself if a true
    covariance is needed.

    The module-level ``ctx`` is reused for every call; only the command
    queue is created (and released) per call.

    Args:
        data: 2-D array of shape ``(n_samples, n_features)``.
        datatype: dtype the data is converted to before upload.

    Returns:
        A symmetric host NumPy array of shape ``(n_features, n_features)``
        with dtype ``datatype``.

    Raises:
        ValueError: if ``data`` is not 2-D (shape unpacking fails).
    """
    # Force a C-contiguous host copy: the leading dimension passed to SYRK
    # below (n_features) is only correct for row-major storage, whereas
    # ``astype`` alone would keep a Fortran-ordered or strided layout.
    host_a = np.ascontiguousarray(data, dtype=datatype)
    n_samples, n_features = host_a.shape

    with cl.CommandQueue(ctx) as queue:
        # Upload the input matrix.
        cl_a = Array(queue, host_a.shape, host_a.dtype)
        cl_a.set(host_a)
        # Output buffer; Array() allocates without initialising the memory.
        cl_cov = Array(queue, (n_features, n_features), dtype=datatype)
        # C = 1.0 * A^T * A, writing only the upper triangle of C.
        pyclblast.syrk(queue, n_features, n_samples, cl_a, cl_cov,
                       n_features, n_features, alpha=1.0, beta=0.0,
                       lower_triangle=False, a_transp=True)
        # Transfer result from device to host.
        upper = cl_cov.get()

    # SYRK leaves the lower triangle untouched, so it still holds whatever
    # was in the freshly allocated buffer. Discard it and mirror the upper
    # triangle so that callers receive a complete symmetric matrix.
    return np.triu(upper) + np.triu(upper, 1).T


# Same driver as in the failing version: 100 calls, each on a freshly
# generated 100000 x 3600 random matrix. With the single shared context the
# issue author reports that the out-of-memory error no longer occurs.
for i in range(100):
    data = np.random.rand(100000, 3600)
    cov = blast_covariance_matrix(data)
    print("finished iteration {}".format(i))