I am trying to allocate device memory, copy to it, perform the calculations on the GPU, copy the results back and then free up the device memory I allocated. I wanted to make sure that I wasn't going over the limit and I wanted to see if I would have enough memory in the shared memory space to dump a few arrays.
When I allocate device memory, there are no errors being returned. When I use cudaMemGetInfo to check the amount of memory allocated, it looks like one cudaMalloc hasn't allocated any memory. 
Also when I try to free the memory, it looks like only one pointer is freed.
I am using the matlab Mexfunction interface to setup the GPU memory and launch the kernel. At this point, I'm not even calling into the kernel and just returning back a unit matrix for the results.
cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);  
/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;
/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}  
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));
/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);   
if (cudaErr != cudaSuccess)
{  
    mexPrintf("could not copy to deviceScattDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));  
/* call the kernel */
// launchKernel<<<1,512>>>(........);   
/* retireve the output */  
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);   
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not copy to receivedReal\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost); 
if (cudaErr != cudaSuccess)
{ 
    mexPrintf("could not copy to receivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));   
/* free the memory. */ 
mexPrintf("\nFree'ing memory.\n");   
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);  
cudaErr = cudaFree(devicePulseDelay); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free devicePulseDelay\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceTarDistance);   
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could free deviceTarDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScattDistance);   
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}   
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScatterers);  
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScatterers\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceReceivedReal);  
if (cudaErr != cudaSuccess) 
{  
    mexPrintf("could free deviceReceivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceReceivedImag);   
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
Here is the output from this:
Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
I feel like there is something obvious that i'm missing. Can anyone help explain what is going on?
EDIT: platform is windows 7 with a Tesla C2050 GPu card.
 
     
    