I have a CUDA kernel that looks like the following:
#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>
extern "C" {
    // Scalar constants live in device memory so the device-side cuBLAS call
    // can take them by pointer from within the kernel.
    __device__ float ONE = 1.0f;
    __device__ float M_ONE = -1.0f;
    __device__ float ZERO = 0.0f;

    // Computes output = W * input via device-side cuBLAS (dynamic parallelism).
    // W is treated as an o-by-i column-major matrix (leading dimension o),
    // input has length i, output has length o. Only global thread 0 performs
    // the call; all other threads are idle.
    __global__ void kernel(float *W, float *input, int i, float *output, int o) {
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        if(idx == 0) {
            cublasHandle_t cnpHandle;
            // Check handle creation: if it fails, the later Sgemv status
            // would be meaningless.
            cublasStatus_t cstat = cublasCreate(&cnpHandle);
            if (cstat != CUBLAS_STATUS_SUCCESS) {
                printf("cublasCreate failed %d\n", (int)cstat);
                return;
            }
            cublasStatus_t s = cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i,
                                           &ONE, W, o, input, 1,
                                           &ZERO, output, 1);
            printf("status %d\n", s);
            // Wait for the child grid launched by cublasSgemv to complete
            // before inspecting/printing results (required with dynamic
            // parallelism; the parent does not implicitly wait here).
            cudaError_t e = cudaDeviceSynchronize();
            printf("sync %d\n", e);
            // Release the handle so repeated launches do not leak resources.
            cublasDestroy(cnpHandle);
        }
    }
}
The host code:
#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
// Forward declaration of the kernel that is compiled separately into the
// .cubin and resolved at runtime via cuModuleGetFunction("kernel").
extern "C" {
    __global__ void kernel(float *W, float *input, int i, float *output, int o);
}
// Wraps a CUDA runtime call so failures report the call site's file and line.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string for a failed runtime call and, unless
// abort is false, terminates the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
      return;
   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort)
      exit(code);
}
int main(int argc, char* argv[])
{
    cuInit(0);
    CUcontext pctx;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&pctx, 0, dev);
    CUmodule module;
    CUresult t = cuModuleLoad(&module, "pathto/src/minimalKernel.cubin");
    CUfunction function;
    CUresult r = cuModuleGetFunction(&function, module, "kernel");
    float *W = new float[2];
    W[0] = 0.1f;
    W[1] = 0.1f;
    float *input = new float[2];
    input[0] = 0.1f;
    input[1] = 0.1f;
    float *out = new float[1];
    out[0] = 0.0f;
    int i = 2;
    int o = 1;
    float *d_W;
    float *d_input;
    float *d_out;
    cudaMalloc((void**)&d_W, 2*sizeof(float));
    cudaMalloc((void**)&d_input, 2*sizeof(float));
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice);
    //kernel<<<1, 2>>>(d_W, d_input, i, d_out, o);
    //cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    //std::cout<<"out:"<<out[0]<<std::endl;
    void * kernelParams[] { &d_W, &d_input, &i, &d_out, &o };
    CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout<<"out:"<<out[0]<<std::endl;
}
When this kernel is launched inline with kernel<<<1,2>>>(), built and linked within Eclipse Nsight, it runs completely fine and out returns 0.02,
as expected.
If I compile the kernel into a .cubin using -G (generate device debugging symbols), the cublas function never runs, and the output is always 0.0.
I can set breakpoints while the .cubin version is running, and I can see that the data going into the cublas function is correct — but it looks like the cublas function never executes at all. It also always returns 0 (CUBLAS_STATUS_SUCCESS). Importantly, this ONLY happens when running from a .cubin.
To compile to a .cubin (with -G) I am using:
nvcc -G -cubin -arch=sm_52 --device-c kernel.cu -o kernel.cubin -dlink -lcudadevrt -lcublas_device
which returns no errors.
Why would the cublas functions within the .cubin stop working if the -G option is added?
CUDA 7.0 linux 14.04 x64 980GTX
 
     
     
    