How can I modify this code to get 100% load of my GPU?
#include <iostream>
using namespace std;
__global__ void saxpy_parallel(int n, float a, float *x, float *y)
{
    // Get the unique ID of this kernel instance
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
    }
}
int main(int argc, char const *argv[])
{
    // Tensors length
    int const n = 100;
    // Define tensors
    float x[n], y[n];
    for (int i = 0; i < n; ++i)
    {
        x[i] = 1.0f*i;
        y[i] = 1.0f*i;
    }
    // Device pointers
    float *d_x, *d_y;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMalloc(&d_y, n*sizeof(float));
    if (cudaMemcpy(d_x, &x, n*sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess)
    {
        printf("Memory Error!\n");
        return 0;
    }
    if (cudaMemcpy(d_y, &y, n*sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess)
    {
        printf("Memory Error!\n");
        return 0;
    }
    // Run the kernel
    saxpy_parallel<<<4096, 512>>>(n, 2.0, d_x, d_y);
    // Retrieve results from the device memory
    cudaMemcpy(&y, d_y, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_y);
    cudaFree(d_x);
    printf("%s\n",y[0]);
    system("PAUSE");
    return 0;
}
 
     
    