In the following code I am simply calling a function foo twice serially from main. The function simply does device memory allocation , and then increments this pointer. Then it exits and goes back to main.
First time foo is called memory is correctly allocated. But now as you can see in output when I call foo again, cuda memory allocation is failing with an error invalid device pointer
I tried using cudaThreadSynchronize() between two foo calls, but no gain. Why memory allocation failing ?
Actually the error is casued due to
matrixd += 3;
Because if I don't do this increment the error disappeared.
But why , even though I am using cudaFree() ?
Kindly help me understand this.
My Output is here
Calling foo for the first time
Allocation of matrixd passed:
I came back to main safely :-)
I am going back to foo again :-)
Allocation of matrixd failed, the reason is:  invalid device pointer
My main() is here
#include<stdio.h>  
#include <cstdlib> // malloc(), free() 
#include <iostream> // cout, stream
#include <math.h>
#include <ctime> // time(), clock()
#include <bitset>
bool foo(  );
/***************************************
Main method.
****************************************/
 int main()  
 { 
    // Perform one warm-up pass and validate
    std::cout << "Calling foo for the first time"<<std::endl;
    foo();
    std::cout << "I came back to main safely :-) "<<std::endl;
    std::cout << "I am going back to foo again :-) "<<std::endl;
    foo( );    
    getchar();  
    return 0;  
 }  
Definition of foo() is in this file :
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include <iostream>
bool foo( )
{
    // Error return value
    cudaError_t status;
    // Number of bytes in the matrix.
    int bytes = 9 *sizeof(float);
        // Pointers to the device arrays
    float *matrixd=NULL; 
    // Allocate memory on the device to store matrix
    cudaMalloc((void**) &matrixd, bytes);
    status = cudaGetLastError();              //To check the error
    if (status != cudaSuccess) {                     
        std::cout << "Allocation of matrixd failed, the reason is:  " <<    cudaGetErrorString(status) << 
        std::endl;
        cudaFree(matrixd);                     //Free call for memory
        return false;
    }
    std::cout << "Allocation of matrixd passed: "<<std::endl;
    ////// Increment address 
    for (int i=0; i<3; i++){
         matrixd += 3;
    }
        // Free device memory
    cudaFree(matrixd);     
    return true;
}
Update
With better error checking. Also I am incrementalism the device pointer only once. This time I get following output:
Calling foo for the first time
Allocation of matrixd passed:
Increamented the pointer and going to free cuda memory:
GPUassert: invalid device pointer C:/Users/user/Desktop/Gauss/Gauss/GaussianElem
inationGPU.cu 44
Line number 44 is cudaFree(). Why it still failing?
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
// GPU function for direct method Gross Jorden method.
bool foo( )
{
    // Error return value
    cudaError_t status;
    // Number of bytes in the matrix.
    int bytes = 9 *sizeof(float);
        // Pointers to the device arrays
    float *matrixd=NULL; 
    // Allocate memory on the device to store each matrix
    gpuErrchk( cudaMalloc((void**) &matrixd, bytes));
    //cudaMemset(outputMatrixd, 0, bytes);
    std::cout << "Allocation of matrixd passed: "<<std::endl;
    ////// Incerament address 
         matrixd += 1;
         std::cout << "Increamented the pointer and going to free cuda memory: "<<std::endl;
         // Free device memory
    gpuErrchk( cudaFree(matrixd));     
    return true;
}
 
     
    