I am trying to use cudaMemcpy3D to transfer dynamically allocated 3d matrix (tensor). Tensor is allocated as contiguous block of memory (see code below). I tried various combinations of cudaExtent and cudaMemcpy3DParms, however the order of elements gets mixed up. I created the following example to demonstrate the issue:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;
   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }
   return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   int i, j, k;
   char *tensorslice;
   int *tensorrow;
   for (i = 0; i < Nx; i++) {
      for (j = 0; j < Ny; j++) {
         for (k = 0; k < Nz; k++) {
            tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
            tensorrow = (int *)(tensorslice + i * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
         }
      }
   }   
}
int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;
   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;
   h_tensor = alloc_tensor(Nx, Ny, Nz);
   cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            h_tensor[i][j][k] = value++;
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
         }
      }
   }
   cudaMemcpy3DParms cpy = { 0 };
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;
   cudaMemcpy3D(&cpy);
   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
   // ... clean-up
}
Output for host variable (h_tensor) and device (d_tensor) differ, looking like
h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...
d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...
What am I doing wrong? What would be the correct way to use cudaMemcpy3D?
 
     
    