I wrote a simple code to understand Dynamic Parallelism. From the values being printed,I see that the child kernel has executed correctly, but when I come back to the parent kernel, I see wrong values being used in place of temp array which is being updated correctly in the child kernel. When I try to update the 'd_cin array' it is giving me wrong values. These are the compilation flags being used :
nvcc -m64 -dc  -gencode arch=compute_35,code=sm_35  -I/opt/apps/cuda/5.5/include -I. -I.. -I../../common/inc -o simple.o -c simple.cu
nvcc -m64 -gencode arch=compute_35,code=sm_35  -o simple simple.o -L/opt/apps/cuda/5.5/lib64 -lcudadevrt
Can someone help me ? Here is the code.
#include <stdio.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
    int index=threadIdx.x+blockIdx.x*blockDim.x;
    d_temp[parentIndex*3+index]=I[parentIndex];
}
__global__ void kernel(double *d_I,double *d_temp,double *d_cin){
    int index=threadIdx.x+blockIdx.x*blockDim.x;
    int i;
    double res=0.0;
        if(index<30){
    cudaStream_t s;
        cudaStreamCreateWithFlags( &s, cudaStreamNonBlocking );
    dim3 dimBlock(3,1,1);
    dim3 dimGrid(1,1,1);
    innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
        __syncthreads();
    if(index==0){
        for(i=0;i<90;i++)
            cuPrintf("temp[%d]: %f\n",i,d_temp[i]);
    }   
    for (i=0;i<3;i++){
            res=res+d_temp[index*3+i];
    }
        __syncthreads();
    d_cin[index]=res;
        cudaStreamDestroy(s);
    }
}
int main(int argc,char **argv){
    double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
    double *d_I;
    double *d_temp;
    double *d_cin;
    double cout[30];
    cudaMalloc(&d_I,30*sizeof(double));
    cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);
    cudaMalloc(&d_temp,3*30*sizeof(double));
    cudaMalloc(&d_cin,30*sizeof(double));
    dim3 dimBlock(8,1,1);
    dim3 dimGrid(4,1,1);
    /*LAUNCH THE KERNEL*/
    printf("Before the kernel\n");
    cudaPrintfInit();
    kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
    //cudaThreadSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();
    printf("After the kernel\n");
    cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);
    int i;
    for(i=0;i<30;i++)
       printf("%f\n",cout[i]);
}
 
     
    