#include <iostream>
#include <vector>
#include <iomanip>
#include <cuda_runtime.h>
// Wraps a CUDA runtime call so that a failure is reported together with the
// source line on which the call was made.
#define CHECK_CUDA(cond) check_cuda(cond, __LINE__)

// Validates the result of a CUDA runtime call.
//
// status: value returned by the CUDA runtime call.
// line:   source line of the call site (supplied by CHECK_CUDA via __LINE__).
//
// On success this returns immediately. On failure it prints the runtime's
// error string and the line number to standard output, then throws the
// integer 0.
void check_cuda(cudaError_t status, std::size_t line)
{
    if (status == cudaSuccess)
        return;

    std::cout << cudaGetErrorString(status) << '\n'
              << "Line: " << line << '\n';
    throw 0;
}
// Copies N floats from input to output using a grid-stride loop.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread starts at its global index and advances by the total number of
// launched threads. No shared memory is used.
//
// output: destination buffer, at least N floats; must not alias input
//         (both pointers are declared __restrict__).
// input:  source buffer, at least N floats.
// N:      number of elements to copy; N <= 0 copies nothing.
__global__ void copy_kernel(float* __restrict__ output, const float* __restrict__ input, int N)
{
    if (N <= 0)
        return;

    // Index math is done in 64 bits: with an int index, `i += stride` can step
    // past INT_MAX when N is close to it, wrap to a negative value that still
    // satisfies `i < N`, and produce an out-of-bounds access.
    const size_t count  = static_cast<size_t>(N);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride)
        output[i] = input[i];
}
// Benchmarks a hand-written device-to-device copy kernel against cudaMemcpy.
//
// For each test size, two device buffers are allocated and each copy method is
// run num_trials times, timed with CUDA events; the mean time per trial is
// printed in milliseconds. Any CUDA failure is reported through CHECK_CUDA.
int main()
{
    constexpr int num_trials = 100;
    const std::vector<int> test_sizes = { 100'000, 1'000'000, 10'000'000, 100'000'000, 250'000'000 };

    // Let the runtime pick a launch configuration for copy_kernel.
    int grid_size = 0, block_size = 0;
    CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, copy_kernel, 0));

    // The timing events are created once and reused for every measurement,
    // then destroyed at the end, instead of being re-created (and never
    // released) for each test size.
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    std::cout << std::fixed << std::setprecision(4) << std::endl;
    for (auto sz : test_sizes)
    {
        std::cout << "Test Size: " << sz << '\n';

        const std::size_t bytes = static_cast<std::size_t>(sz) * sizeof(float);
        float *d_vector_src = nullptr, *d_vector_dest = nullptr;
        CHECK_CUDA(cudaMalloc(&d_vector_src, bytes));
        CHECK_CUDA(cudaMalloc(&d_vector_dest, bytes));

        // --- custom kernel ---
        float accumulate = 0.0f;
        for (int i = 0; i < num_trials; i++)
        {
            CHECK_CUDA(cudaEventRecord(start));
            copy_kernel<<<grid_size, block_size>>>(d_vector_dest, d_vector_src, sz);
            // A launch does not return a status; query it explicitly so an
            // invalid configuration is not silently timed as a successful run.
            CHECK_CUDA(cudaGetLastError());
            CHECK_CUDA(cudaEventRecord(stop));
            CHECK_CUDA(cudaEventSynchronize(stop));
            float current_time = 0.0f;
            CHECK_CUDA(cudaEventElapsedTime(&current_time, start, stop));
            accumulate += current_time;
        }
        std::cout << "\tKernel Copy Time: " << accumulate / num_trials << "ms\n";

        // --- cudaMemcpy device-to-device ---
        accumulate = 0.0f;
        for (int i = 0; i < num_trials; i++)
        {
            CHECK_CUDA(cudaEventRecord(start));
            CHECK_CUDA(cudaMemcpy(d_vector_dest, d_vector_src, bytes, cudaMemcpyDeviceToDevice));
            CHECK_CUDA(cudaEventRecord(stop));
            CHECK_CUDA(cudaEventSynchronize(stop));
            float current_time = 0.0f;
            CHECK_CUDA(cudaEventElapsedTime(&current_time, start, stop));
            accumulate += current_time;
        }
        std::cout << "\tMemcpy Time: " << accumulate / num_trials << "ms\n";

        CHECK_CUDA(cudaFree(d_vector_src));
        CHECK_CUDA(cudaFree(d_vector_dest));
    }

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    return 0;
}
GTX 1050 Mobile
Test Size: 100000
        Kernel Copy Time: 0.0118ms
        Memcpy Time: 0.0127ms
Test Size: 1000000
        Kernel Copy Time: 0.0891ms
        Memcpy Time: 0.0899ms
Test Size: 10000000
        Kernel Copy Time: 0.8697ms
        Memcpy Time: 0.8261ms
Test Size: 100000000
        Kernel Copy Time: 8.8871ms
        Memcpy Time: 8.2401ms
Test Size: 250000000
        Kernel Copy Time: 22.3060ms
        Memcpy Time: 20.5419ms
GTX 1080 Ti
Test Size: 100000
    Kernel Copy Time: 0.0166ms
    Memcpy Time: 0.0188ms
Test Size: 1000000
    Kernel Copy Time: 0.0580ms
    Memcpy Time: 0.0727ms
Test Size: 10000000
    Kernel Copy Time: 0.4674ms
    Memcpy Time: 0.5047ms
Test Size: 100000000
    Kernel Copy Time: 4.7992ms
    Memcpy Time: 3.7722ms
Test Size: 250000000
    Kernel Copy Time: 7.2485ms
    Memcpy Time: 5.5863ms
Test Size: 1000000000
    Kernel Copy Time: 31.5570ms
    Memcpy Time: 22.3184ms
RTX 2080 Ti
Test Size: 100000
    Kernel Copy Time: 0.0048ms
    Memcpy Time: 0.0054ms
Test Size: 1000000
    Kernel Copy Time: 0.0193ms
    Memcpy Time: 0.0220ms
Test Size: 10000000
    Kernel Copy Time: 0.1578ms
    Memcpy Time: 0.1537ms
Test Size: 100000000
    Kernel Copy Time: 2.1156ms
    Memcpy Time: 1.5006ms
Test Size: 250000000
    Kernel Copy Time: 5.5195ms
    Memcpy Time: 3.7424ms
Test Size: 1000000000
    Kernel Copy Time: 23.2106ms
    Memcpy Time: 14.9483ms