CNugteren / CLBlast

Tuned OpenCL BLAS

Multi-GPU, multi-threaded invocation of CLBlastSgemm seems to be unreliable.

shekarman1 opened this issue · comments

Hi,

Here is a test example that illustrates the problem. I am running Windows 10 and have two Intel GPUs (ARC A770) using Intel's OpenCL implementation and CLBlast (version 1.6.0). Test code given below.

M = N = K = 100
Each thread computes Sgemm 200 times.

The test code implements two scenarios:

  1. Run the GEMM operations in separate threads but in sequence -- always works
  2. Run the GEMM operations in separate threads but in parallel - works only sometimes. Other times I get this error:

CLBLast (unexpected): bad allocation with error code -2039

Command lines are:

  1. GemmMultiThreaded.exe -gpus 2 (to run the operations in sequence)
  2. GemmMultiThreaded.exe -gpus 2 -parallel (to run the operations in parallel)

Any ideas on what I am doing wrong here? Thanks in advance.

```cpp
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <stdarg.h>

#include <windows.h>
#include <direct.h>
#include <io.h>

#include <thread>


#include <CL/cl.h>
#include <clblast_c.h>

struct ThreadArgs {
    int gpuId;
    int m, n, k;
    cl_mem bufA, bufB, bufC;
    float alpha, beta;
    std::thread* tid;
};

cl_device_id devices[8];
cl_context context;
cl_platform_id platformId[1];
cl_command_queue clblasStreams[8];
cl_program program;
ThreadArgs threadArgs[8];

void contextCallback(const char* errInfo, const void* privateInfo, size_t cb, void* userData) {
  printf("contextCallback %s\n", errInfo);
}


void checkError(const int line, int gpuId, cl_int err) {
  if (err != CL_SUCCESS) {
    fprintf(stderr, "%d:%d: OpenCL err %d\n", __LINE__, gpuId, err);
  }
}

void* invokeGemm(void* args) {
  ThreadArgs* threadArgs;
  cl_int err;

  threadArgs = (ThreadArgs*) args;
  printf("Starting thread for GPU %d\n", threadArgs->gpuId);
  for (int i = 0; i < 200; i++) {
    // Single-precision GEMM on this thread's command queue
    err = CLBlastSgemm(CLBlastLayoutColMajor, CLBlastTransposeNo, CLBlastTransposeNo,
                       threadArgs->n, threadArgs->m, threadArgs->k, threadArgs->alpha,
                       threadArgs->bufB, 0, threadArgs->k,
                       threadArgs->bufA, 0, threadArgs->n,
                       threadArgs->beta,
                       threadArgs->bufC, 0, threadArgs->n,
                       &clblasStreams[threadArgs->gpuId], NULL);
    checkError(__LINE__, threadArgs->gpuId, err);
  }
  printf("GPU %d done\n", threadArgs->gpuId);
  return NULL;
}

void testGemm(int ngpus, int m, int n, int k, int parallel) {
  float* cpuA, *cpuB, *cpuC;
  cl_int err;

  cpuA = (float*) calloc(1, sizeof(float) * m * k);
  cpuB = (float*) calloc(1, sizeof(float) * k * n);
  cpuC = (float*) calloc(1, sizeof(float) * m * n);

  for (int i = 0; i < m * k; i++) {
    cpuA[i] = (float) i;
  }
  for (int i = 0; i < k * n; i++) {
    cpuB[i] = (float) i;
  }


  for (int i = 0; i < ngpus; i++) {
    threadArgs[i].gpuId = i;
    threadArgs[i].m = m;
    threadArgs[i].n = n;
    threadArgs[i].k = k;

    threadArgs[i].bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, m * k * sizeof(float), NULL, &err);
    checkError(__LINE__, i, err);

    threadArgs[i].bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, k * n * sizeof(float), NULL, &err);
    checkError(__LINE__, i, err);

    threadArgs[i].bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, m * n * sizeof(float), NULL, &err);
    checkError(__LINE__, i, err);

    clblasStreams[i] = clCreateCommandQueueWithProperties(context, devices[i], NULL, &err);
    checkError(__LINE__, i, err);

    err = clEnqueueWriteBuffer(clblasStreams[i], threadArgs[i].bufA, CL_TRUE, 0, m * k * sizeof(float), cpuA, 0, NULL, NULL);
    checkError(__LINE__, i, err);

    err = clEnqueueWriteBuffer(clblasStreams[i], threadArgs[i].bufB, CL_TRUE, 0, k * n * sizeof(float), cpuB, 0, NULL, NULL);
    checkError(__LINE__, i, err);

    threadArgs[i].tid = new std::thread(invokeGemm, (void*) &threadArgs[i]);
    if (parallel == 0) {
      threadArgs[i].tid->join();
    }
  }

  if (parallel) {
    for (int i = 0; i < ngpus; i++) {
      threadArgs[i].tid->join();
    }
  }
}

int main(int argc, char* argv[]) {
  int i, m, n, k, parallel = 0;
  cl_int err;
  size_t size;
  cl_uint numPlatforms, ngpus, rgpus = 2;
  char buffer[2048];

  buffer[0] = '\0';

  m = n = k = 100;
  for (i = 1; i < argc; i++) {
    if  (strcmp(argv[i], "-M") == 0) {
      m = atoi(argv[i + 1]);
      i++;
    } else if (strcmp(argv[i], "-N") == 0) {
      n = atoi(argv[i + 1]);
      i++;
    } else if (strcmp(argv[i], "-K") == 0) {
      k = atoi(argv[i + 1]);
      i++;
    } else if (strcmp(argv[i], "-gpus") == 0) {
      rgpus = atoi(argv[i + 1]);
      i++;
    } else if (strcmp(argv[i], "-parallel") == 0) {
      parallel = 1;
    } else {
      break;
    }
  }

  fprintf(stderr, "Testing with M %d N %d K %d\n", m, n, k);

  err = clGetPlatformIDs(1, &platformId[0], &numPlatforms);
  if (err != CL_SUCCESS) {
    fprintf(stderr, "Could not find an OpenCL platform, error %d\n", err);
  }

  if (numPlatforms == 0) {
    fprintf(stderr, "No OpenCL platforms found");
    return -1;
  }

  if (numPlatforms > 1) {
    fprintf(stderr, "Found more than one OpenCL platform. Choosing first.");
  }

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_PROFILE, sizeof(buffer), buffer, &size);
  fprintf(stdout, "Platform profile: %s", buffer);

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_VERSION, sizeof(buffer), buffer, &size);
  fprintf(stdout, "Platform version: %s", buffer);

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_NAME, sizeof(buffer), buffer, &size);
  fprintf(stdout, "Name: %s\n", buffer);

  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformId[0], 0};
  err = clGetDeviceIDs(platformId[0], CL_DEVICE_TYPE_GPU, 8, &devices[0], &ngpus);
  if (err != CL_SUCCESS) {
    return err;
  }

  if (ngpus < rgpus) {
    fprintf(stderr, "Requested using %d GPUs, only %d available. Continuing with %d gpus\n", rgpus, ngpus, ngpus);
  } else if (rgpus < ngpus) {
    fprintf(stderr, "Found %d gpus, requested %d, only using %d\n", ngpus, rgpus, rgpus);
  }

  ngpus = min(rgpus, ngpus);
  context = clCreateContext(properties, ngpus, devices, contextCallback, NULL, &err);
  if (err != CL_SUCCESS) {
    fprintf(stderr, "Could not create GPU context: %d\n", err);
    return err;
  }

  testGemm(ngpus, m, n, k, parallel);
  return err;
}
```

Could you maybe edit your post to put your code in a properly formatted block (```cpp <code here> ```) and with indentation? Or alternatively, remove the code completely and attach the .cpp file? That would make it a bit more readable.

Note that a while back I made a multi-threading test in a branch, but it was never merged (I forget why). Perhaps you can try it out? Here is the branch: master...multithreading_test (I just merged the latest master in to bring it up to date).

First, thank you for the prompt response.

Apologies for the unformatted code. I tried attaching the .cpp file but GitHub wouldn't let me. I edited the OP with the correct formatting.

And, thank you for the branch. I will check it and keep you posted.

@CNugteren I tested with the branch as you recommended. I am seeing the same behavior as before -- still getting -2039 when I run the above test example in parallel. Sequentially, it works (as before).

Thanks for testing. But I'm not sure if you did what I actually meant: compile and run the ./clblast_test_multithreading program which is available in that branch.

But other than that, I'll try to find some time to run your code example and see if I can reproduce the issue.

I tried to reproduce the issue but I can't. I think this is because I don't have a multi-GPU system myself to test on.

A few suggestions after looking at your code that you can still try (other than run the earlier mentioned multithreading test):

  • Use an event and wait within the thread for it to complete using clWaitForEvents(1, &event);, as done in the CLBlast samples (see the sketch after this list).
  • A related issue was reported (and solved) a while ago here: #155. Perhaps you can have a look at that sample code and see what is different in your case.
  • Did you try replacing the call to CLBlast with a launch of another OpenCL kernel, just something simple and hand-made? You can e.g. take one from here. That way we'll know whether the issue is with CLBlast or with the remainder of your code.
  • I see a few globals and no clRelease* functions in your code. When I ran it on my machine it also segfaulted at the very end. Perhaps you can clean up your code: release all memory and avoid globals. In particular, perhaps some of the OpenCL objects can live within the thread, e.g. the command queues could be created inside each thread?
  • You are using one OpenCL context for everything. Does the problem go away if you use one context per device instead?
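
To make the first point concrete, here is a minimal sketch of what I mean (not an exact drop-in: it reuses the variable names from your test code and assumes the buffers and the per-thread queue are already set up):

```cpp
// Sketch only: m, n, k, alpha, beta, bufA, bufB, bufC and the per-thread
// command queue are assumed to be set up as in the test code above.
cl_event event = NULL;
CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutColMajor,
                                        CLBlastTransposeNo, CLBlastTransposeNo,
                                        n, m, k, alpha,
                                        bufB, 0, k,
                                        bufA, 0, n,
                                        beta,
                                        bufC, 0, n,
                                        &queue, &event);
if (status == CLBlastSuccess) {
  clWaitForEvents(1, &event);  // block in this thread until the GEMM has finished
  clReleaseEvent(event);       // release the event so it does not leak
}
```

This way each thread only enqueues the next GEMM once the previous one has completed on its own queue.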

I hope that with these suggestions you are able to debug the issue further. Let me know if any of the above is unclear.

@CNugteren Thank you for a detailed response. The use case I am trying to implement is to run a multi-threaded multi-GPU program where each thread can do its own invocation of kernels and GEMM. In response to your suggestions:

  1. I have clWaitForEvents() or rather clFinish() calls in the threads in my main program, but neglected to put them in the example code I posted. I am not sure that waiting on the event is functionally relevant, unless you are saying I can't invoke multiple GEMM calls in the same thread. I will change my example code and test it to see if that is indeed the problem (see the sketch after this list) -- though in my main program, I am sure multiple GEMM calls in the same thread are not causing an issue.
  2. The issue you reference is a single thread working with multiple GPUs.
  3. Yes. I have other kernels that are working fine in a multi-threaded, multi-GPU environment -- in a single context.
  4. Yes, I am not releasing memory in the example code I posted. I will fix that in the example code.
  5. I need to use one OpenCL context (there are other reasons that mandate this).
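
For point 1, this is roughly the per-thread synchronization I have in my main program, sketched on top of the example code above (the function name is just for illustration):

```cpp
// Sketch only: uses the same globals and ThreadArgs as the example code above.
void* invokeGemmWithFinish(void* args) {
  ThreadArgs* ta = (ThreadArgs*) args;
  for (int i = 0; i < 200; i++) {
    cl_int err = CLBlastSgemm(CLBlastLayoutColMajor, CLBlastTransposeNo, CLBlastTransposeNo,
                              ta->n, ta->m, ta->k, ta->alpha,
                              ta->bufB, 0, ta->k,
                              ta->bufA, 0, ta->n,
                              ta->beta,
                              ta->bufC, 0, ta->n,
                              &clblasStreams[ta->gpuId], NULL);
    checkError(__LINE__, ta->gpuId, err);
  }
  clFinish(clblasStreams[ta->gpuId]);  // drain this thread's queue before returning
  return NULL;
}
```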

One other thing you could do is compile CLBlast in verbose mode. That way messages will be printed, giving some indication of when the errors appear and what it was doing before that.

@CNugteren An update. The problem I am encountering is related to thread safety -- and not to multi-GPU use. I can run on multiple GPUs using a single context. As you suggested, I am building with VERBOSE to see if I get any more information.

@CNugteren When I run with a version of CLBlast built with VERBOSE=ON, the test case works. See the attached log. But if I run it with regular CLBlast (no VERBOSE), the test case fails with this error:

------------------------------ non VERBOSE -----------------
Testing with M 100 N 100 K 100
Platform profile: FULL_PROFILEPlatform version: OpenCL 3.0 Name: Intel(R) OpenCL Graphics
Starting thread for GPU 0
Starting thread for GPU 1
CLBlast (unexpected): Internal logic error: Cache::Store: object already in cache
43:0: OpenCL err -2039
GPU 1 done
GPU 0 done


clblast_verbose.log

OK thank you, that is useful feedback. It looks like the mutex for the cache here (which stores compiled objects to avoid re-compilation) doesn't work properly with multi-threading. Probably when you enable verbose mode you are just lucky/unlucky enough not to hit the bug. Most likely you could also make your original program work by adding a sleep/wait at the right time, but that is of course not a solution.

A small question. When you say:

I can run on multiple GPUs using a single context

You mean you ran the sample code from your first post here, right? What did you modify exactly to make it work on a single GPU? Then I can also try to reproduce the issue, which makes debugging easier. Thanks!

I'll investigate further.

On your question of what I changed to make the sample code work for multiple GPUs:

I upgraded the OpenCL driver (I have Intel GPUs) and Intel acknowledged that they had a bug in the older version of OpenCL driver and the newer version has a fix for it. I can dig up the driver version numbers if it is of interest.

Also, I am now compiling for OpenCL 3.0 (before the driver upgrade, I had to specify 2.2 when I compiled my code). I am not sure this is really relevant, but it is one other thing I changed. And finally, I am now using the clang compiler (versus gcc), which also seems irrelevant.

Thank you for looking into this issue and your prompt responses.

I'm still not able to reproduce locally, unfortunately, because I have only one GPU, and in my single-GPU multi-threaded test my OpenCL driver (also Intel) crashes. However, I did look into the code a bit and I might have found the bug. Even though the cache actions are locked by a mutex, it can still happen that two threads both check, one after the other, whether object X is in the cache; then thread 1 stores X in the cache, and later thread 2 also wants to store X, because neither of them could retrieve it from the cache at first. That might actually be perfectly valid behaviour.

So I think the solution here might be simple: replace the throw LogicError(); here with a simple return;.
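
To illustrate the pattern I mean (just a sketch, not the actual CLBlast cache code): with a mutex-protected store, a duplicate store of the same key by another thread can simply be ignored instead of treated as a logic error:

```cpp
// Sketch of the pattern only, not the CLBlast source: a mutex-protected cache
// in which a concurrent duplicate store is treated as benign.
#include <map>
#include <mutex>

template <typename Key, typename Value>
class Cache {
 public:
  void Store(const Key& key, const Value& value) {
    std::lock_guard<std::mutex> lock(mutex_);
    const auto result = cache_.emplace(key, value);
    if (!result.second) {
      // Another thread missed the cache at the same time, compiled the same
      // program, and stored it first: keep the existing entry and return
      // instead of throwing an "object already in cache" error.
      return;
    }
  }

 private:
  std::map<Key, Value> cache_;
  std::mutex mutex_;
};
```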

Could you try that for me? Thanks! And do make sure speed is still OK for the second and subsequent runs of a kernel, because the cache is meant to store compiled objects that can be re-used the next time.

Good catch on the issue -- thanks. Yes, I will definitely try this and report back. It will take me a day or so to try this -- I have a long running test on the machine that has 2 Intel GPUs and it is not "free." Likely tomorrow.

@CNugteren Your suggestion to inhibit throwing the exception/error and just returning works! I am testing now. If you like, I can check the fix into a new branch and push it.

Great!

If you like, I can check the fix into a new branch and push it.

As you want. If you prefer I can also do it, no worries. If you do it, do not forget to add a small note in the CHANGELOG file about the fix.