I have a structure with arrays of structures inside in C, and I need a copy of that in the GPU. For that I am writing a function that makes some cudaMalloc and cudaMemcpys of the variables in the struct from host to device.
A simple version (the real one has various structs and variables/arrays inside) of the struct is:
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
My problem is that I must be doing something wrong in the memory allocation and copy of the struct. When I copy the variables withing Graph, I can see that they are properly copied (by accessing it in a kernel as in the example below). For example, I can check that graph.nBoundary=3.
However, I can only see this if I do not allocate and copy the memory of Node *. If I do, I get -858993460 instead of 3. Interestingly, Node * is not wrongly allocated, as I can inspect the value of say graph.node[0].pos[0] and it has the correct value.
This only happens with the graph.nBoundary. All the other variables remain with the correct numerical values, but this one gets "wronged" when running the cudaMemcpy of the Node*.
What am I doing wrong and why does this happen? How do I fix it?
Let me know if you need more information.
MCVE:
#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph,unsigned int * d_res){
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++){
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%d", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
//copy constants
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));
// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));
//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));
// Crate auxiliary pointers to grab them from host and pass them to device
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
for (int i = 0; i < inGraph->nNode; i++){
// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));
return outGraph;
}