This is the first time I am implementing structures in CUDA. In the following program I am copying a structure to the GPU and performing a basic operation on the data, and copying back the result to the Host.
#include<stdio.h>
inline cudaError_t checkCuda(cudaError_t result)
{
    #if defined(DEBUG) || defined(_DEBUG)
        if (result != cudaSuccess) {
            fprintf(stderr, "CUDA Runtime Error: %sn", cudaGetErrorString(result));
        assert(result == cudaSuccess);
        }
    #endif
    return result;
}
typedef struct myStruct {
    int* a;
    int b;
}MyStruct;
__global__ void structOperation(MyStruct *d_data){
    int idx = threadIdx.x;
    d_data->a[idx] += 10;
}
int main(){
    MyStruct *h_data, *d_data, *out_data;
    size_t structSize = sizeof(MyStruct);
    size_t intSize = sizeof(int);
    h_data = (MyStruct *) malloc(structSize * 1);
    h_data->b = 32;
    h_data->a = (int *)malloc(intSize * h_data->b);
    out_data = (MyStruct *) malloc(structSize * 1);
    out_data->b = 32;
    out_data->a = (int *)malloc(intSize * out_data->b);
    for(int i = 0; i<32; i++){
        h_data->a[i] = i;   
    }
    //Memory allocation for the Struct
    checkCuda(cudaMalloc(&d_data, sizeof(MyStruct) * 1));
    checkCuda(cudaMalloc(&(d_data->a), sizeof(int) * 32));
    checkCuda(cudaMemcpy(&d_data, &h_data, sizeof(MyStruct) * 1, cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(&(d_data->a), &(h_data->a), sizeof(int) * 32, cudaMemcpyHostToDevice)); 
    structOperation<<<1,32>>>(d_data);
    checkCuda(cudaMemcpy(&out_data, &d_data, sizeof(myStruct) * 1, cudaMemcpyDeviceToHost));
  //cudaMemcpy(&(out_data->a), &(d_data->a), sizeof(int) * d_data->b, cudaMemcpyDeviceToHost); 
    printf("\nDataElements : ");
    for(int i = 0; i<32; i++){
        printf("    %d",out_data->a[i]);
    }
    printf("\n");
}
I am getting 'Segmentation Fault' as the result of execution. I guess I am operating the structure incorrectly. What is the proper way to implement?
 
    