While my program compiles with no errors, I run my program, and my initial C statements work, but when I hit my first CUDA function cudaMallocHost() the program just stops doing anything. I get no errors, and I have to use cntr-c to manually exit. I tried removing various parts, and it seems that whenever I hit any of my CUDA functions, it has the same effect. 
I included cuda.h and cuda_runtime.h, and am compiling a .cu file with nvcc with the NVIDIA SDK and CUDA 5.5 installed. 
int main(){
    int set[6][6] = {{1,2,3,4,5,6}, {7,8,9,10,11,12}, {13,14,15,16,17,18}, {19,20,21,22,23,24}, {25,26,27,28,29,30}, {31,32,33,34,35,36}};
    int *i=0, *d_d=0, a, b, nbytes;
    int size = sizeof(set[0]);
    nbytes = size*size*sizeof(int);
    for(a=0;a<6;a++){
        for(b=0;b<6;b++){
            printf("%d, ", set[a][b]);
        }
        printf("\n");
    }
/*end safezone*/
    cudaMallocHost((void**)&set, nbytes);
    memset(set,0,nbytes);
    printf("CPUmem");
    cudaMalloc((void**)&d_d, nbytes);
    cudaMemset(set,0,nbytes);
    printf("GPUmem");
    cudaMemcpy(d_d, set, nbytes, cudaMemcpyHostToDevice);
    dim3 threads = dim3(((size % 512) + 1), 1);
    dim3 blocks = dim3((threads.x / 512) + 1, 1);
    printf("Copied & DIM setup");
    matrixflop<<<blocks, threads>>>(set);
    printf("Threads created");
    cudaMemcpy(set, d_d, nbytes, cudaMemcpyDeviceToHost);
/*start safezone*/
    for(a=0;a<6;a++){
        for(b=0;b<6;b++){
            printf("%d, ", set[a][b]);
        }
        printf("\n");
    }
/*end safezone*/
    cudaFreeHost(i);
    cudaFree(d_d);
    printf("Success");
    getchar();
    exit(EXIT_SUCCESS);
    return 0;
}
Edited to include code.
 
    