I have two cudaArrays, a1 and a2, of the same size, which represent two matrices.
Using texture memory, I want to multiply those two cudaArrays. Then I want to copy the result back into a regular host array; let's call it *a1_h.
The problem is that I don't know how to do it. I've managed to define and allocate my two cudaArrays and to fill them with floats.
Now I want to write a kernel that performs those multiplications.
Can somebody help me?
ROOM_X and ROOM_Y are ints; they define the width and height of the matrices. mytex_M1 and mytex_M2 are textures declared as: texture < float,2,cudaReadModeElementType >.
Here is my main:
#include <stdio.h>
#include <stdlib.h>

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation that was attempted.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two ROOM_X x ROOM_Y float matrices on the host, uploads
// them into cudaArrays bound to the textures mytex_M1 / mytex_M2, launches
// mul_texture to write its result into a linear device buffer, and copies
// that buffer back into M1_h.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(int argc, char * argv[]) {
    (void)argc;
    (void)argv;

    // Byte count computed in size_t so large matrices cannot overflow an int.
    const size_t size = (size_t)ROOM_X * ROOM_Y * sizeof(float);

    // Host staging buffers used to fill the cudaArrays (M1_h also receives the result).
    float *M1_h = (float *)malloc(size);
    float *M2_h = (float *)malloc(size);
    if (M1_h == NULL || M2_h == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        free(M1_h);
        free(M2_h);
        return EXIT_FAILURE;
    }

    // Channel descriptions for the two single-float 2D textures.
    cudaChannelFormatDesc channelDesc_M1 = cudaCreateChannelDesc<float>();
    cudaChannelFormatDesc channelDesc_M2 = cudaCreateChannelDesc<float>();

    // Allocate the cudaArrays that back the textures.
    cudaArray *M1_array = NULL;
    cudaArray *M2_array = NULL;
    checkCuda(cudaMallocArray(&M1_array, &channelDesc_M1, ROOM_X, ROOM_Y), "cudaMallocArray(M1)");
    checkCuda(cudaMallocArray(&M2_array, &channelDesc_M2, ROOM_X, ROOM_Y), "cudaMallocArray(M2)");

    // Fill the matrices on the host.
    Matrix(M1_h);
    Matrix(M2_h);

    // Upload the initial values of M1 and M2 into the cudaArrays.
    checkCuda(cudaMemcpyToArray(M1_array, 0, 0, M1_h, size, cudaMemcpyHostToDevice),
              "cudaMemcpyToArray(M1)");
    checkCuda(cudaMemcpyToArray(M2_array, 0, 0, M2_h, size, cudaMemcpyHostToDevice),
              "cudaMemcpyToArray(M2)");

    // Texture sampling parameters.
    // NOTE(review): with linear filtering and normalized coordinates the kernel
    // must sample at texel centres, i.e. ((x + 0.5f) / ROOM_X, (y + 0.5f) / ROOM_Y),
    // to read stored elements rather than interpolated values. If exact element
    // reads are intended, cudaFilterModePoint with normalized = false is simpler
    // -- confirm against the mul_texture implementation.
    mytex_M1.addressMode[0] = cudaAddressModeWrap;
    mytex_M1.addressMode[1] = cudaAddressModeWrap;
    mytex_M1.filterMode = cudaFilterModeLinear;
    mytex_M1.normalized = true; // coordinates in [0,1]
    mytex_M2.addressMode[0] = cudaAddressModeWrap;
    mytex_M2.addressMode[1] = cudaAddressModeWrap;
    mytex_M2.filterMode = cudaFilterModeLinear;
    mytex_M2.normalized = true; // coordinates in [0,1]

    // Bind the arrays to the textures.
    checkCuda(cudaBindTextureToArray(mytex_M1, M1_array), "cudaBindTextureToArray(M1)");
    checkCuda(cudaBindTextureToArray(mytex_M2, M2_array), "cudaBindTextureToArray(M2)");

    // Linear device buffer that receives the result.
    float *M1_d = NULL;
    checkCuda(cudaMalloc((void **)&M1_d, size), "cudaMalloc(M1_d)");

    // One single-thread block per matrix element.
    // NOTE(review): 1x1 blocks leave most of each warp idle; a larger block
    // (e.g. 16x16) with a bounds check in the kernel would be faster, but the
    // launch shape must match how mul_texture computes its indices.
    dim3 dimGrid(ROOM_X, ROOM_Y);
    dim3 dimBlock(1, 1);

    // Run the kernel; the result of the multiplication is written to M1_d.
    mul_texture<<<dimGrid, dimBlock >>>(M1_d);
    // Kernel launches return no status; launch-configuration errors show up here.
    checkCuda(cudaGetLastError(), "mul_texture launch");

    // Blocking copy of the result back to the host. The launch above is in the
    // default stream, so errors raised while the kernel ran are reported here.
    checkCuda(cudaMemcpy(M1_h, M1_d, size, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    // Unbind the textures before releasing the arrays they reference.
    checkCuda(cudaUnbindTexture(mytex_M1), "cudaUnbindTexture(M1)");
    checkCuda(cudaUnbindTexture(mytex_M2), "cudaUnbindTexture(M2)");

    // Free device memory.
    checkCuda(cudaFreeArray(M1_array), "cudaFreeArray(M1)");
    checkCuda(cudaFreeArray(M2_array), "cudaFreeArray(M2)");
    checkCuda(cudaFree(M1_d), "cudaFree(M1_d)");

    // Free host memory.
    free(M1_h);
    free(M2_h);
    return 0;
}
 
     
    