i've been working in a program that requires to use array of structs inside another array of structs or structure of arrays, i decided to use this approach given the initial conditions (there are dynamic), the following are the structs that i'm trying to allocate in CUDA
struct population
{
    int id;
    tour *tours;
};
struct tour
{
    int id;
    node *nodes;
    double value;
    int node_qty;
};
struct node 
{
    int id;
    double x;
    double y;
    int item_qty;
    item *items;
};
struct item 
{
    float weight;
    float value;
};
As you can see, this group of structures are one inside another and as i have said most of the properties are dynamic (P.E: the amount of nodes, the amount of items and the amount of tours). I have made many attempts to allocate memory but the result is almost always the same "Access violation writing location". As a side note i have tried to follow some advices from other questions like this: cudaMemcpy segmentation fault or this Memory allocation on GPU for dynamic array of structs.
The following code allocates most of the memory, but when i try to access the properties of the structures the result is "an illegal memory access"
// 1. cudaMalloc a pointer to device memory that hold population
population* d_initial_population;
HANDLE_ERROR(cudaMalloc((void**)&d_initial_population, sizeof(population)));
    
// 2. Create a separate tour pointer on the host.
tour* d_tour_ptr;
HANDLE_ERROR(cudaMalloc((void**)&d_tour_ptr, sizeof(tour) * POPULATION_SIZE));
   
// 3. Create a separate node pointer on the host.
node* d_node_ptr[POPULATION_SIZE];
   
// Allocate memory on device according to population size
for (int i = 0; i < POPULATION_SIZE; ++i)
{
    // 4. cudaMalloc node storage on the device for node pointer
    HANDLE_ERROR(cudaMalloc((void**)&(d_node_ptr[i]), sizeof(node) * node_quantity));
    // 5. cudaMemcpy the pointer value of node pointer from host to the device node pointer
    HANDLE_ERROR(cudaMemcpy(&(d_tour_ptr[i].nodes), &(d_node_ptr[i]), sizeof(node*), cudaMemcpyHostToDevice));
    // Optional: Copy an instantiated object on the host to the device pointer
    HANDLE_ERROR(cudaMemcpy(d_node_ptr[i], initial_tour.nodes, sizeof(node) * node_quantity, cudaMemcpyHostToDevice));
}
// 6. cudaMemcpy the pointer value of tour pointer from host to the device population pointer
HANDLE_ERROR(cudaMemcpy(&(d_initial_population->tours), &d_tour_ptr, sizeof(tour*), cudaMemcpyHostToDevice));
After this initial approach my next attempt was trying to allocate the inner structures first and then go upwards, my attempt was with the node and item structs as follows
// Define a pointer for struct "node"
node* dev_node; 
// 1. cudaMalloc a pointer to device memory that will hold the struct "node", in this case is called "dev_node"
HANDLE_ERROR(cudaMalloc((void**)&dev_node, node_quantity * sizeof(node)));
// 2. (optionally) copy an instantiated object of struct "node" on the host to the device pointer "dev_node" from step 1 using cudaMemcpy
HANDLE_ERROR(cudaMemcpy(dev_node, n, node_quantity * sizeof(node), cudaMemcpyHostToDevice));
// 3. Create a separate "item" pointer on the host, in this case it's called "dev_item"
item* dev_item;
// 4. cudaMalloc "item" storage on the device for "dev_item"
HANDLE_ERROR(cudaMalloc((void**)&dev_item, node_quantity));
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
}
// 5. cudaMemcpy the pointer value of "dev_item" from the host to the device pointer &(dev_node->i)
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMemcpy(&(dev_node[i].items), &(dev_item[i]), sizeof(item*), cudaMemcpyHostToDevice));
}
// 6. Copy the embedded data
for (int i = 0; i < node_quantity; i++)
{
    HANDLE_ERROR(cudaMemcpy(&dev_item[i], n[i].items, sizeof(item) * dev_node[i].item_qty, cudaMemcpyHostToDevice));
}
But this last attempt gives me an Access violation writing location 0x0000000B00700C00 in the following line
HANDLE_ERROR(cudaMalloc((void**)&(dev_item[i]), sizeof(item)* initial_tour.nodes[i].item_qty));
I suppose that the error(s) are associated to some kind of missing or bad memory allocation but i haven't been able to figure it out where.
UPDATE 1: After some research as indicated by talonmies I have done a simplified version of my code only to solve this but still doesn't work.
This is my new code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct item
{
    int id;
    float weight;
    float value;
    int node;
    int taken;
};
struct node
{
    int id;
    double x;
    double y;
    int item_qty;
    item* items;
};
struct tour
{
    int id;
    int node_qty;
    node* nodes;
};
struct population
{
    int id;
    tour* tours;
};
static void HandleError(cudaError_t err, const char* file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        getchar();
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
    
int main()
{
    // Get user defined values
    int population_size, tour_size, node_size, item_size;
    printf("Enter values for amount of population, amount of tours, amount of nodes and amount of items:\n");
    // For this exercise the values are 1 10 5 4
    scanf("%i %i %i %i", &population_size, &tour_size, &node_size, &item_size);
    printf("\n");
    printf("The values are: %i %i %i %i\n", population_size, tour_size, node_size, item_size);
#pragma region ALLOCATE CPU MEMORY
    // Declare pointers
    population* host_population;
    tour* host_tour;
    node* host_node;
    item* host_item;
    // Allocate host memory for population
    host_population = (population*)malloc(sizeof(population) * population_size);
    for (int p = 0; p < population_size; p++)
    {
        host_population[p].tours = (tour*)malloc(sizeof(tour) * tour_size);
    }
    // Allocate host memory for tour
    host_tour = (tour*)malloc(sizeof(tour) * tour_size);
    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].nodes = (node*)malloc(sizeof(node) * node_size);
    }
    // Allocate host memory for node
    host_node = (node*)malloc(sizeof(node) * node_size);
    for (int n = 0; n < node_size; n++)
    {
        host_node[n].items = (item*)malloc(sizeof(item) * item_size);
    }
    // Allocate memory for item
    host_item = (item*)malloc(sizeof(item) * item_size);
#pragma endregion
#pragma region FILL CPU DATA
    //Fill the full structure with information, for test purposes these values are going to be taken
    // 1. Item Data
    int item_id[4] = { 1,2,3,4 };
    float item_value[4] = { 300,50,30,40 };
    float item_weight[4] = { 400,200,40,2 };
    int item_node[4] = { 3,4,5,2 };
    // 2. Node Data
    int node_id[5] = { 1,2,3,4,5 };
    double node_x[5] = { 0,6,14,11,7 };
    double node_y[5] = { 0,-5,5,13,5 };
    int node_item[5] = { 0,1,1,1,1 };
    // 3. Tour Data
    int tour_id[10] = { 1,2,3,4,5,6,7,8,9,10 };
    // 4. Population Data
    int population_id = 1;
    for (int i = 0; i < item_size; i++)
    {
        host_item[i].id = item_id[i];
        host_item[i].value = item_value[i];
        host_item[i].taken = rand() % 2;
        host_item[i].node = item_node[i];
        host_item[i].weight = item_weight[i];
    }
    for (int n = 0; n < node_size; n++)
    {
        host_node[n].id = node_id[n];
        host_node[n].x = node_x[n];
        host_node[n].y = node_y[n];
        host_node[n].item_qty = node_item[n];
        for (int i = 0; i < item_size; i++)
        {
            if (host_node[n].id == host_item[i].node)
            {
                memcpy(host_node[n].items, &host_item[i], sizeof(item) * node_item[n]);
            }
        }
    }
    for (int t = 0; t < tour_size; t++)
    {
        host_tour[t].id = tour_id[t];
        host_tour[t].node_qty = node_size;
        memcpy(host_tour[t].nodes, host_node, sizeof(node) * node_size);
    }
    for (int p = 0; p < population_size; p++)
    {
        host_population[p].id = population_id;
        memcpy(host_population[p].tours, host_tour, sizeof(tour) * tour_size);
    }
    //printStructure(host_population, population_size, tour_size);
#pragma endregion
    population* device_population;
    tour *device_tour;
    node* device_node;
    item* device_item;
    // Allocate host memory for population
    HANDLE_ERROR(cudaMalloc((void**)&device_population, sizeof(population) * population_size));
    // Allocate host memory for tour
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }
    return 0;
}
After this exercise i have a more specific question to ask: What's the difference between this:
    // Allocate host memory for tour
    HANDLE_ERROR(cudaMalloc((void**)&device_tour, sizeof(tour*) * population_size));
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }
And this
// Allocate host memory for tour
    device_tour[1];
    for (int p = 0; p < population_size; p++)
    {
        HANDLE_ERROR(cudaMalloc((void**)&(device_tour[p]), sizeof(tour) * tour_size));
    }
The second block works without any issue, but the first one returns an exception "Access violation writing location". Is there a way to make the first block work without using static arrays?
