There is no general atomic method provided by CUDA that covers arbitrary struct atomic updates. Some possibilities:

1. Because you specifically want to update two adjacent 32-bit items, you can use a generalized 64-bit atomic operation, a variant of what is described here.

2. Another alternative is the one you already mention: implementing a critical section (sketched at the end of this answer).

3. Finally, another possible approach is a parallel reduction, although this is not exactly analogous to atomic usage (also sketched at the end of this answer).

Along the lines of suggestion 1 above, here is a modification of the code from this answer which shows how you can use a 64-bit atomic to update two packed 32-bit floats at once:
$ cat t56.cu
#include <stdio.h>
#define DSIZE 512
#define nTPB 256
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
typedef union {
  float floats[2];                 // two adjacent 32-bit quantities
  unsigned long long int ulong;    // the same 8 bytes viewed as one 64-bit word, for atomic update
} my_atomics;
__device__ my_atomics test;
__device__ unsigned long long int my_atomicAdd_2floats(unsigned long long int* address, float val0, float val1)
{
    my_atomics loctest;
    unsigned long long old = *address;
    do {
      loctest.ulong = old;
      my_atomics loc;
      loc.floats[0] = val0 + loctest.floats[0];
      loc.floats[1] = val1 + loctest.floats[1];
      // attempt to swap in the updated 64-bit value; if another thread
      // changed *address since we read it, the CAS fails and we retry
      old = atomicCAS(address, loctest.ulong, loc.ulong);
    } while (old != loctest.ulong);
    return old;
}
// each thread atomically adds its data value and its thread index to the packed pair
__global__ void add_test(const float* data)
{
    int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (idx < DSIZE)
      my_atomicAdd_2floats(&(test.ulong), data[idx], (float)idx);
}
int main() {
  float *d_data, *h_data;
  my_atomics my_init;
  my_init.floats[0] = 0.0f;
  my_init.floats[1] = 0.0f;
  h_data = (float *)malloc(DSIZE * sizeof(float));
  if (h_data == 0) {printf("malloc fail\n"); return 1;}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
  cudaCheckErrors("cm1 fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 1.0f;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cmcp1 fail");
  cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
  cudaCheckErrors("cmcp2 fail");
  add_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
  cudaCheckErrors("cmcp3 fail");
  printf("device float0 result = %f\n", my_init.floats[0]);
  printf("device float1 result = %f\n", my_init.floats[1]);
  float host_val0 = 0.0f;
  float host_val1 = 0.0f;
  for (int i = 0; i < DSIZE; i++) {
    host_val0 += h_data[i];
    host_val1 += (float)i;
  }
  printf("host float0 result = %f\n", host_val0);
  printf("host float1 result = %f\n", host_val1);
  return 0;
}
$ nvcc -arch=sm_35 -o t56 t56.cu -Wno-deprecated-gpu-targets
$ cuda-memcheck ./t56
========= CUDA-MEMCHECK
device float0 result = 512.000000
device float1 result = 130816.000000
host float0 result = 512.000000
host float1 result = 130816.000000
========= ERROR SUMMARY: 0 errors
$
I'm not guaranteeing any of the code in this answer is defect-free. I suggest testing it carefully before use.
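Along the lines of suggestion 2, here is a minimal sketch of the critical-section approach, using atomicCAS on an integer lock. This is my own illustration rather than code from any linked answer; the names (lock, vals, update_test) are placeholders, and error checking is omitted for brevity. Note the if/done structure: on pre-Volta hardware, threads in the same warp cannot safely spin-wait on a lock held by a warp sibling, so each thread attempts the whole acquire/update/release sequence inside a single loop iteration.

#include <cstdio>

__device__ int lock = 0;                  // 0 = free, 1 = held
__device__ float vals[2] = {0.0f, 0.0f};  // the two floats the lock protects

__global__ void update_test(const float *data, int n)
{
    int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (idx < n) {
        bool done = false;
        while (!done) {
            if (atomicCAS(&lock, 0, 1) == 0) {  // try to acquire the lock
                vals[0] += data[idx];           // critical section
                vals[1] += (float)idx;
                __threadfence();                // make updates visible before release
                atomicExch(&lock, 0);           // release the lock
                done = true;
            }
        }
    }
}

int main() {
    const int n = 512;
    float h_data[n];
    float *d_data;
    for (int i = 0; i < n; i++) h_data[i] = 1.0f;
    cudaMalloc(&d_data, n * sizeof(float));
    cudaMemcpy(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice);
    update_test<<<(n + 255) / 256, 256>>>(d_data, n);
    cudaDeviceSynchronize();
    float results[2];
    cudaMemcpyFromSymbol(results, vals, 2 * sizeof(float));
    printf("float0 result = %f\nfloat1 result = %f\n", results[0], results[1]);
    return 0;
}

Since every update is serialized through one lock, this is likely to be slower than the 64-bit atomic approach; it is shown only to illustrate the structure.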
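Along the lines of suggestion 3, the two sums can also be computed with no atomics at all, as ordinary parallel reductions. Here is a minimal sketch using thrust that computes the same two quantities as the code above, as two separate reductions; again this is my own illustration, and the same caveat about careful testing applies:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>
#include <cstdio>

int main() {
    const int n = 512;
    thrust::device_vector<float> data(n, 1.0f);  // same input as h_data above
    // sum of the data values, computed as a parallel reduction
    float sum0 = thrust::reduce(data.begin(), data.end(), 0.0f);
    // sum of the indices 0..n-1, likewise as a reduction
    float sum1 = thrust::reduce(thrust::counting_iterator<int>(0),
                                thrust::counting_iterator<int>(n),
                                0.0f, thrust::plus<float>());
    printf("float0 result = %f\nfloat1 result = %f\n", sum0, sum1);
    return 0;
}

This avoids contention entirely, but it restructures the computation; it is not a drop-in replacement for an atomic read-modify-write on the struct.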