I have to program the Floyd algorithm using OpenCL, it works fine but only with n<268. when n>=268 i have an "Access violation reading location" when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).
Here is my code:
graphe is an adjacency matrix, and distances is the distances matrix
    int n;
    printf("enter n value: ");
    scanf("%d", &n);
    printf("\n");
    int n2 = n * n;
    int matSize = n2 * sizeof(int*);
    int* graphe = malloc(sizeof(int) * n2);
    int* distances = malloc(sizeof(int) * n2);
    //mat[i,j] => mat[i*n + j]
    if (graphe == NULL)
        printf("malloc failed\n");
    init_graphe(graphe, n);
    copy(graphe, distances, n);
initialization of opencl variables:
    char* programSource = load_kernel("kernel.cl");
    cl_int status;
    // STEP 1: Discover and initialize the platforms
    cl_uint numPlatforms = 0;
    cl_platform_id* platforms = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    printf("Number of platforms = %d\n", numPlatforms);
    platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    char Name[1000];
    clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
    printf("Name of platform : %s\n", Name);
    fflush(stdout);
    // STEP 2: Discover and initialize the devices
    cl_uint numDevices = 0;
    cl_device_id* devices = NULL;
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
    printf("Number of devices = %d\n", (int)numDevices);
    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
    for (int i = 0; i < numDevices; i++) {
        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
        printf("Name of device %d: %s\n\n", i, Name);
    }
    // STEP 3: Create a context
    fflush(stdout);
    cl_context context = NULL;
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
    // STEP 4: Create a command queue
    fflush(stdout);
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
    // STEP 5: Create device buffers
    fflush(stdout);
    cl_mem buffer_graphe;
    cl_mem buffer_n;
    cl_mem buffer_distances;
    cl_mem buffer_k;
    buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
    buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
    buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
    buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
    fflush(stdout);
    // STEP 6: Create and compile the program
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
    printf("Compilation\n");
    fflush(stdout);
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
    // STEP 8: Create the kernel
    cl_kernel kernel = NULL;
    fflush(stdout);
    kernel = clCreateKernel(program, "floyd", &status);
    size_t globalWorkSize[2] = { n, n };
    size_t localWorkSize[3] = { 20,20 };
Execution of the kernel:
    clock_t start = clock();
    int k;
    for (k = 0; k < n; k++) {
        status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
        status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);
        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
        status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
        status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
        status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);
        status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
        clFinish(cmdQueue);  
        status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
        clFinish(cmdQueue);
    }
and the kernel:
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
    int i = get_global_id(0);
    int j = get_global_id(1);
    int ij = i * (*n) + j;
    int ik = i * (*n) + (*k);
    int kj = (*k) * (*n) + j;
    if (distances[ik] + distances[kj] < distances[ij]) {
        distances[ij] = distances[ik] + distances[kj];
    }
}
 
    