I have a GPU-only class T which I want to create on GPU but have a reference to which on the CPU, so I can send the link as an argument to different CUDA kernels.
class T
{
public:
    int v;
public:
    __device__ T() { v = 10; }
    __device__ ~T() {}
    __device__ int compute() { return v; }
};
Here are the kernels that I was to create the class instance and to call the compute() function.
__global__ void kernel(T* obj, int* out)
{
    if(blockIdx.x * blockDim.x + threadIdx.x == 0) {
        out[0] = obj->compute(); // no kernel error, but it returns garbage
    }
}
__global__ void cudaAllocateGPUObj(T* obj)
{
    if(blockIdx.x * blockDim.x + threadIdx.x == 0) {
        obj = new T;
        // if I call `out[0] = obj->compute();` here, everything works fine
    }
}
The main function simply allocates memory for the pointer of type T* which later is used as an argument for the cudaAllocateGPUObj.
int main()
{
    int cpu, *gpu;
    cudaMalloc((void**)&gpu, sizeof(int));
    T* obj;
    cudaMalloc((void**)&obj, sizeof(T*));
    cudaAllocateGPUObj<<<1,1>>>(obj);
    kernel<<<1,1>>>(obj, gpu);
    cudaMemcpy(&cpu, gpu, sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    printf("cudaMemcpy\nresult: %d\n", cpu);
    return 0;
}
The problem with this code (as specified in the comments in the code) is that when I call out[0] = obj->compute(); in the cudaAllocateGPUObj kernel and transfer the obtained value to the CPU, everything is correct. But if I want to obtain the member value in another kernel, it becomes garbage, though if I change the return value from the v variable to a constant, everything works fine.
Could you please tell me what is wrong with this code.