I'm passing a matrix from host to device and trying to store it on the gpu memory. to test if it really works I'm copying the first row from the matrix in the gpu to the host. but after it returns from the wrapper it prints garbage values.
C file:
int col;
srand(time(NULL));
matrix = (int**) malloc(10*sizeof(int*));
for(int j = 0; j < 10; j++)
{
    col = 3 + (rand() % 7);
    matrix[j] = (int*) malloc(sizeof(int)*col);
    matrix[j][0] = col-1;
    for(int i = 1; i < col; i++)
    {
       matrix[j][i] = i;
    }
}
int first_row[10];
int rows = 10;
pass_matrix_kernel_wrapper(matrix, &rows); 
foo_wrapper(first_row); // get the first row of the matrix from the gpu
for(int i = 0; i < matrix[0][0]; i++)
{
    printf("%d, ", first_row[i]);
}
Cuda file:
__shared__ int **gpu_matrix;
__shared__ int gpu_rows;
void pass_matrix_kernel_wrapper(int** matrix, int* rows)
{
    cudaMalloc((void***)(&gpu_matrix), sizeof(int*) * (*rows));
    for (int i = 0; i < *rows; i++)
    {
    int cols = matrix[i][0] + 1;
        int* temp;
        cudaMalloc( (void**)  &(temp), sizeof(int) * cols); // allocate for 1 int in    each int pointer
        cudaMemcpy(temp, matrix[i], sizeof(int) * cols, cudaMemcpyHostToDevice); // copy data
        cudaMemcpy(gpu_matrix+i, &temp, sizeof(int*) * cols, cudaMemcpyHostToDevice);
    }
}
void foo_wrapper(int* back)
{
    int* temp;
    cudaMalloc( (void**)  &(temp), sizeof(int) * 11); // allocate for 1 int in each int pointer
    test_kernel<<<1,1>>>(temp); // just checking if it works
    cudaDeviceSynchronize();
    int size = temp[0] + 1;
    cudaMemcpy(back, &temp, sizeof(int) * size, cudaMemcpyDeviceToHost);
}
__global__ void test_kernel(int* back)
{
    for(int i = 0;i < gpu_matrix[0][0] + 1;i++) // gpu_matrix[0][0] stores num of cols in row
    {
        back[i] = gpu_matrix[0][i];
    }
}
