I'm trying to use the CUDA Driver API to copy data into a 2D CUDA array, using the program listed below, but `cuMemcpy2D()` fails with an "invalid argument" error (`CUDA_ERROR_INVALID_VALUE`) when I pass it my copy parameters. Which value in them is wrong?
#include <cuda.h>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <numeric>
#include <string>
/// Reports a fatal error and terminates the program.
///
/// Writes `message`, followed by a newline, to the standard error stream and
/// then exits the process with the EXIT_FAILURE status. Never returns.
///
/// @param message text to print before exiting
[[noreturn]] void die_(const std::string& message)
{
    std::cerr << message;
    std::cerr << "\n";
    exit(EXIT_FAILURE);
}
/// Terminates the program (via die_) if a CUDA Driver API call has failed.
///
/// Does nothing when `status` is CUDA_SUCCESS. Otherwise prints
/// "<extra_message>: <driver's description of status>" and exits.
///
/// @param status        result code returned by a CUDA Driver API call
/// @param extra_message context describing the operation that was attempted
void die_if_error(CUresult status, const std::string& extra_message) {
    if (status == CUDA_SUCCESS) {
        return;
    }
    // cuGetErrorString() can itself fail for a code it does not recognize, in
    // which case no usable string is provided. Appending a null `const char*`
    // to a std::string is undefined behaviour, so fall back to a fixed text.
    const char* error_string = nullptr;
    const CUresult lookup_status = cuGetErrorString(status, &error_string);
    if (lookup_status != CUDA_SUCCESS || error_string == nullptr) {
        error_string = "unrecognized CUDA error code";
    }
    die_(extra_message + ": " + error_string);
}
/// Reinterprets a CUDA Driver API address value as a typed pointer.
///
/// @tparam T       pointee type of the resulting pointer (defaults to void)
/// @param  address the address value to convert
/// @return `address` reinterpreted as a `T*`; no validity check is performed
template <typename T = void>
T* as_pointer(CUdeviceptr address) noexcept
{
    T* const typed_ptr = reinterpret_cast<T*>(address);
    return typed_ptr;
}
/// Reinterprets a raw pointer as a CUDA Driver API address value
/// (the inverse of as_pointer).
///
/// @param ptr the pointer to convert
/// @return `ptr` reinterpreted as a CUdeviceptr; no validity check is performed
CUdeviceptr as_address(void* ptr) noexcept
{
    const CUdeviceptr address = reinterpret_cast<CUdeviceptr>(ptr);
    return address;
}
int main() {
    CUresult status;
    int device_id = 0;
    status = cuInit(0);
    die_if_error(status, "Initializing the CUDA driver");
    CUcontext pctx;
    status = cuDevicePrimaryCtxRetain(&pctx, device_id);
    die_if_error(status, "Obtaining the primary device context");
    cuCtxSetCurrent(pctx);
    struct { unsigned width, height; } dims = { 3, 3 };
    std::cout << "Creating a " << dims.width << " x " << dims.height << " CUDA array" << std::endl;
    CUarray arr_handle;
    {
        CUDA_ARRAY_DESCRIPTOR array_descriptor;
        array_descriptor.Width = dims.width;
        array_descriptor.Height = dims.height;
        array_descriptor.Format = CU_AD_FORMAT_FLOAT;
        array_descriptor.NumChannels = 1;
        status = cuArrayCreate(&arr_handle, &array_descriptor);
        die_if_error(status, "Failed creating a 2D CUDA array");
    }
    auto arr_size = dims.width * dims.height;
    CUdeviceptr dptr;
    status = cuMemAllocManaged(&dptr, arr_size, CU_MEM_ATTACH_GLOBAL);
    die_if_error(status, "Failed allocating managed memory");
    float* ptr_in = as_pointer<float>(dptr);
    std::iota(ptr_in, ptr_in + arr_size, 0);
    CUmemorytype ptr_in_memory_type;
    status = cuPointerGetAttribute(&ptr_in_memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, as_address(ptr_in));
    if (not (ptr_in_memory_type == CU_MEMORYTYPE_UNIFIED or ptr_in_memory_type == CU_MEMORYTYPE_DEVICE)) {
        die_("Unexpected memory type for ptr_in");
    }
    std::cout << "The memory type of ptr_in is " << (ptr_in_memory_type == CU_MEMORYTYPE_DEVICE ? "DEVICE" : "UNIFIED") << std::endl;
    std::cout << "Will copy from ptr_in into a 2D CUDA array" << std::endl;
    CUDA_MEMCPY2D cp;
    {
        // Source
        cp.srcXInBytes = 0; cp.srcY = 0; // No offset
        cp.srcMemoryType = ptr_in_memory_type;
        cp.srcDevice = as_address(ptr_in);
        // no extra source pitch
        cp.srcPitch = dims.width * sizeof(float);
        // Destination
        cp.dstXInBytes = 0; cp.dstY = 0; // No destination offset
        cp.dstMemoryType = CU_MEMORYTYPE_ARRAY;
        cp.dstArray = arr_handle;
        cp.WidthInBytes = dims.width * sizeof(float);
        cp.Height = dims.height;
    }
    status = cuMemcpy2D(&cp);
    die_if_error(status, "cuMemcpy2D failed");
    cuMemFree(as_address(ptr_in));
}
Full output of this program:
Creating a 3 x 3 CUDA array
The memory type of ptr_in is DEVICE
Will copy from ptr_in into a 2D CUDA array
cuMemcpy2D failed: invalid argument
Additional information:
- CUDA toolkit version: 11.4
- NVIDIA driver version: 470.57.02
- OS distribution: Devuan Chimaera GNU/Linux
- GPU: GeForce 1050 TI Boost (Compute Capability 6.1)
- Host architecture: amd64