I am trying to copy the 3x4 matrix [1 2 3 4 ; 5 6 7 8 ; 9 10 11 12], stored on the host in column-major order as x, to a buffer d_x on an NVIDIA GPU using cublasSetMatrix(), and then copy d_x back into the host array y using cublasGetMatrix().
#include<stdio.h>
#include"cublas_v2.h"
int main()
{
    cublasHandle_t hand;
    float x[][3] = { {1,5,9} , {2,6,10} , {3,7,11} , {4,8,12} };
    float y[4][3] = {};
    float *d_x;
    printf("X\n");
    for( int i=0 ; i<4 ; i++ )
    {
        printf("Row %i:",i+1);
        for( int j = 0 ; j<3 ; j++ )
        {
            printf(" %f",x[i][j]);
        }
        putchar('\n');
    }
    printf("Y\n");
    for( int i=0 ; i<4 ; i++ )
    {
        printf("Row %i:",i+1);
        for( int j = 0 ; j<3 ; j++ )
        {
            printf(" %f",y[i][j]);
        }
        putchar('\n');
    }
    cublasCreate( &hand );
    cudaMalloc( &d_x,sizeof(d_x) );
    cublasSetMatrix( 3,4,sizeof(float),x,3,d_x,3 );
    cublasGetMatrix( 3,4,sizeof(float),d_x,3,y,3 );
    printf("X\n");
    for( int i=0 ; i<4 ; i++ )
    {
        printf("Row %i:",i+1);
        for( int j = 0 ; j<3 ; j++ )
        {
            printf(" %f",x[i][j]);
        }
        putchar('\n');
    }
    printf("Y\n");
    for( int i=0 ; i<4 ; i++ )
    {
        printf("Row %i:",i+1);
        for( int j = 0 ; j<3 ; j++ )
        {
            printf(" %f",y[i][j]);
        }
        putchar('\n');
    }
    cudaFree( d_x );
    cublasDestroy( hand );
    return 0;
}
The output after the copy shows y filled with 0s.
Did any of the cuBLAS function calls fail, and/or have the wrong arguments been passed to them?
Also, please explain the purpose of each argument to the functions.
Using GeForce GTX 650 with CUDA 6.5 on Fedora 21 x86_64.
 
     
    