I have a CUDA kernel that looks like the following:
#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>
extern "C" {
    // Scalar constants live in device memory so the device-side cuBLAS call
    // can take them by pointer from within the kernel.
    __device__ float ONE = 1.0f;
    __device__ float M_ONE = -1.0f;
    __device__ float ZERO = 0.0f;

    // Computes output = W * input via device-side cuBLAS (dynamic parallelism).
    // W is treated as an o-by-i column-major matrix (leading dimension o),
    // input has length i, output has length o. Only global thread 0 performs
    // the call; all other threads are idle.
    __global__ void kernel(float *W, float *input, int i, float *output, int o) {
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        if(idx == 0) {
            cublasHandle_t cnpHandle;
            // Check handle creation: if it fails, the later Sgemv status
            // would be meaningless.
            cublasStatus_t cstat = cublasCreate(&cnpHandle);
            if (cstat != CUBLAS_STATUS_SUCCESS) {
                printf("cublasCreate failed %d\n", (int)cstat);
                return;
            }
            cublasStatus_t s = cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i,
                                           &ONE, W, o, input, 1,
                                           &ZERO, output, 1);
            printf("status %d\n", s);
            // Wait for the child grid launched by cublasSgemv to complete
            // before inspecting/printing results (required with dynamic
            // parallelism; the parent does not implicitly wait here).
            cudaError_t e = cudaDeviceSynchronize();
            printf("sync %d\n", e);
            // Release the handle so repeated launches do not leak resources.
            cublasDestroy(cnpHandle);
        }
    }
}
The host code:
#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
// Forward declaration of the kernel that is compiled separately into the
// .cubin and resolved at runtime via cuModuleGetFunction("kernel").
extern "C" {
    __global__ void kernel(float *W, float *input, int i, float *output, int o);
}
// Wraps a CUDA runtime call so failures report the call site's file and line.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string for a failed runtime call and, unless
// abort is false, terminates the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
      return;
   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort)
      exit(code);
}
int main(int argc, char* argv[])
{
    cuInit(0);
    CUcontext pctx;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&pctx, 0, dev);
    CUmodule module;
    CUresult t = cuModuleLoad(&module, "pathto/src/minimalKernel.cubin");
    CUfunction function;
    CUresult r = cuModuleGetFunction(&function, module, "kernel");
    float *W = new float[2];
    W[0] = 0.1f;
    W[1] = 0.1f;
    float *input = new float[2];
    input[0] = 0.1f;
    input[1] = 0.1f;
    float *out = new float[1];
    out[0] = 0.0f;
    int i = 2;
    int o = 1;
    float *d_W;
    float *d_input;
    float *d_out;
    cudaMalloc((void**)&d_W, 2*sizeof(float));
    cudaMalloc((void**)&d_input, 2*sizeof(float));
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice);
    //kernel<<<1, 2>>>(d_W, d_input, i, d_out, o);
    //cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    //std::cout<<"out:"<<out[0]<<std::endl;
    void * kernelParams[] { &d_W, &d_input, &i, &d_out, &o };
    CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout<<"out:"<<out[0]<<std::endl;
}
When this kernel is launched inline with kernel<<<1,2>>>(), built and linked within Eclipse Nsight, it runs completely fine and out returns 0.02,
as expected.
If I compile the kernel into a .cubin using -G (generate device debugging symbols), the cublas function never runs, and the output is always 0.0.
I can set breakpoints while the .cubin version is running, and I can see that the data going into the cublas function is correct — but it looks like the cublas function never executes at all. It also always returns 0 (CUBLAS_STATUS_SUCCESS). Importantly, this ONLY happens when running from a .cubin.
To compile to a .cubin (with -G) I am using:
nvcc -G -cubin -arch=sm_52 --device-c kernel.cu -o kernel.cubin -dlink -lcudadevrt -lcublas_device
which returns no errors.
Why would the cublas functions within the .cubin stop working if the -G option is added?
CUDA 7.0 linux 14.04 x64 980GTX
 
     
     
    