I wrote a function swap to conveniently swap device array pointers, but it is not working, I assume I am swapping local array pointers in the swap function and not the ones I am passing to it.
__global__ void device_add_one(float *A, float *B)
{
    for (int index = blockIdx.x * blockDim.x + threadIdx.x;
         index < N;
         index += blockDim.x * gridDim.x)
    {
        // just for the example
         B[index] = A[index] + 1;
    {
}
void swap(float *a, float *b)
{
    float *temp = a;
    a = b;
    b = temp;
}
void loop(float *host_array, int size, int loops)
{
    cudaMalloc(&A, (size * sizeof(float));
    cudaMalloc(&B, (size * sizeof(float));
    cudaMemcpy(A, host_array, (size * sizeof(float), cudaMemcpyHostToDevice);
    for (int i = 0; i < loops; i++) {
        device_add_one<<< 1, 254 >>>(A, B);
        // swap pointers like this does not work
        swap(A, B);
        /* This works:
        float *temp = a;
        a = b;
        b = temp;
        */
    }
    cudaMemcpy(host_array, A, (size * sizeof(float), cudaMemcpyDeviceToHost);
}
 
    