I am new to cuda and am trying to parallelize a very simple program shown below that was inspired from this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/
typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;
void add(int n, S * s){
    for(int i = 0; i < n; i++){
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}
int main(int argc, char *argv[]){
    int n = 1<<21;
    S grid[n];
    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);
    add(n,grid);
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d\n",grid[i].temp);
    }
    return 0;
}
I am not getting desired results however as when I am updating temp all the new values are 0. I think that the issue is because the array of structs I am passing to my add function cannot be accessed in device memory. I, however, am having a hard time figuring out how to fix this. I found this post on stackoverflow and am a little unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C
The cuda code I have for reference is here:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define SIZE 1000
#define NS_PER_US 1000
typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;
__global__ void add(int n, S * s){
    int index = threadIdx.x;
    int stride = blockDim.x;
    //printf("%d\n",(n-index)/stride);
    //printf("%d\n",s[0].temp);
    for(int i = index; i < n; i+=stride){
        printf("%d\n",index);
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        printf("%d\n",index);
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}
int main(int argc, char *argv[]){
    int  *h_a;
    int  *d_a;
    int  num_blocks= 2;
    int  num_th_per_blk= 5;
    int n = 1<<21;
    S grid[n];
    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);
    size_t  memSize;
    memSize = num_blocks* num_th_per_blk* sizeof(int);
    h_a= (int*) malloc(memSize);
    cudaMallocManaged((void **)&grid, n * sizeof(S));
    cudaMalloc( (void**) &d_a, memSize);
    dim3  dimGrid(num_blocks);
    dim3  dimBlock(num_th_per_blk);    
    add<<< dimGrid, dimBlock >>>(n,grid);
    cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d\n",grid[i].newtemp);
    }
    clock_gettime(CLOCK_REALTIME, &end); 
    t = clock() - t; 
    time(&endtime);
    gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
    time_diff = difftime(endtime, starttime);
    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff); 
    return 0;
}
I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case any help would be greatly appreciated.
 
    