I am NEW to java programming and trying to code a matrix multiplication program in jCUDA.
While transferring the data from host to device and vice versa I use:
cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);
cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);
cuMemcpyDtoH(Pointer.to(hostMatrixC), devMatrixC, numRows * numCols * Sizeof.FLOAT);
Here, the devMatrixA, devMatrixB and devMatrixC are the matrices to be stored on device memory. And hostMatrixA, hostMatrixB and hostMatrixC are the matrices stored on my Host memory.
When I call above functions for data transfer, it gives me following error 'The method to(byte[]) in the type Pointer is not applicable for the arguments (float[][])' with 'to' in 'Pointer.to(' is red underlined. I am using eclipse. I have given my complete code as below.
Pardon my java knowledge, and please suggest if I am going into wrong direction.
Package JCudaMatrixAddition;
import static jcuda.driver.JCudaDriver.*;
import java.io.*;
import jcuda.*;
import jcuda.driver.*;
import jcuda.Pointer;
import jcuda.Sizeof;
public class JCudaMatrixAddition {
    public static void main(String[] args) throws IOException 
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);
        // Create the PTX file by calling the NVCC
        String ptxFilename = preparePtxFile("JCudaMatrixAdditionKernel.cu");
        //Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet (device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);
        //Load PTX file
        CUmodule module = new CUmodule();
        cuModuleLoad(module,ptxFilename);
        //Obtain a function pointer to the Add function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "add");
        int numRows = 32;
        int numCols = 32;
        //Allocate and fill Host input Matrices:
        float hostMatrixA[][] = new float[numRows][numCols];
        float hostMatrixB[][] = new float[numRows][numCols];
        float hostMatrixC[][] = new float[numRows][numCols];
        for(int i = 0; i<numRows; i++)
        {
            for(int j = 0; j<numCols; j++)
            {
                hostMatrixA[i][j] = (float) 1.0;
                hostMatrixB[i][j] = (float) 1.0;
            }
        }
        // Allocate the device input data, and copy the
        // host input data to the device
        CUdeviceptr devMatrixA = new CUdeviceptr();
        cuMemAlloc(devMatrixA, numRows * numCols * Sizeof.FLOAT);
        //This is the part where it gives me the error
        cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);
        CUdeviceptr devMatrixB = new CUdeviceptr();
        cuMemAlloc(devMatrixB, numRows * numCols * Sizeof.FLOAT);
        //This is the part where it gives me the error
        cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);
        //Allocate device matrix C to store output
        CUdeviceptr devMatrixC = new CUdeviceptr();
        cuMemAlloc(devMatrixC, numRows * numCols * Sizeof.FLOAT);
        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.
        Pointer kernelParameters = Pointer.to(Pointer.to(new int[]{numRows}),
                                   Pointer.to(new int[]{numRows}), 
                                   Pointer.to(devMatrixA),
                                   Pointer.to(devMatrixB),
                                   Pointer.to(devMatrixC));
        //Kernel thread configuration
        int blockSize = 32;
        int gridSize = 1;
        cuLaunchKernel(function, 
                       gridSize, 1, 1,
                       blockSize, 32, 1,
                       0, null, kernelParameters, null);
        cuCtxSynchronize();
        // Allocate host output memory and copy the device output
        // to the host.
        //This is the part where it gives me the error
        cuMemcpyDtoH(Pointer.to(hostMatrixC), devMatrixC, numRows * numCols * Sizeof.FLOAT);
        //verify the result
        for (int i =0; i<numRows; i++)
        {
            for (int j =0; j<numRows; j++)
            {
                System.out.print("   "+ hostMatrixB[i][j]);
            }
            System.out.println("");
        }
        cuMemFree(devMatrixA);
        cuMemFree(devMatrixB);
        cuMemFree(devMatrixC);
    }