Context
When implementing generic functions in C that work with a range of types, void* is regularly used. The libc function qsort() is a classic example. Internally qsort() and many other algorithms require a swap() function.
A simple but typical implementation of a generic swap looks like this:
void swap(void* x, void* y, size_t size) {
    char t[size];
    memcpy(t, x, size);
    memcpy(x, y, size);
    memcpy(y, t, size);
}
For larger types a byte-by-byte swap can be used, or malloc which would be slow, but the focus here is on what happens when this generic swap() is used for small types.
A better generic swap?
It turns out that if we match for some common type sizes (int and long for 4 and 8 bytes on x86_64) which also cover float, double, pointer etc, that we can get a surprising performance improvement:
void swap(void* x, void* y, size_t size) {
  if (size == sizeof(int)) {
    int t      = *((int*)x);
    *((int*)x) = *((int*)y);
    *((int*)y) = t;
  } else if (size == sizeof(long)) {
    long t      = *((long*)x);
    *((long*)x) = *((long*)y);
    *((long*)y) = t;
  } else {
    char t[size];
    memcpy(t, x, size);
    memcpy(x, y, size);
    memcpy(y, t, size);
  }
}
NB: This could clearly be refined to using #if rather than if/else and for more types.
In the context of the below generic quicksort() implementation, the above swap gives ~2x performance improvement for a 10,000,000 random int sort, compared with the more standard memcpy() only swap at top. This is using gcc-9 or clang-10 on ubuntu 20.04 with -O3.
Question
This seems like a remarkable result.
- Does this break any standards?
- Can anyone validate this?
- What makes this gain possible? Is it simply copying in "wider words" or is the some compiler optimization / inlining at play?
- If it is indeed valid, why is it not being done already? Or is it?
NB: I have not examined the generated assembly code - yet.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
typedef bool (*cmp)(const void*, const void*);
bool cmp_ints_less(const void* a, const void* b) {
  return *(const int*)a < *(const int*)b;
}
bool cmp_ints_greater(const void* a, const void* b) {
  return *(const int*)a > *(const int*)b;
}
bool cmp_floats_less(const void* a, const void* b) {
  return *(const float*)a < *(const float*)b;
}
bool cmp_floats_greater(const void* a, const void* b) {
  return *(const float*)a > *(const float*)b;
}
bool cmp_doubles_less(const void* a, const void* b) {
  return *(const double*)a < *(const double*)b;
}
bool cmp_doubles_greater(const void* a, const void* b) {
  return *(const double*)a > *(const double*)b;
}
bool cmp_strs_less(const void* a, const void* b) {
  return strcmp(*((const char**)a), *((const char**)b)) < 0;
}
bool cmp_strs_greater(const void* a, const void* b) {
  return strcmp(*((const char**)a), *((const char**)b)) > 0;
}
void swap(void* x, void* y, size_t size) {
  if (size == sizeof(int)) {
    int t      = *((int*)x);
    *((int*)x) = *((int*)y);
    *((int*)y) = t;
  } else if (size == sizeof(long)) {
    long t      = *((long*)x);
    *((long*)x) = *((long*)y);
    *((long*)y) = t;
  } else {
    char t[size];
    memcpy(t, x, size);
    memcpy(x, y, size);
    memcpy(y, t, size);
  }
}
void* partition(void* start, void* end, size_t size, cmp predicate) {
  if (start == NULL || end == NULL || start == end) return start;
  char* storage = (char*)start;
  char* last    = (char*)end - size; // used as pivot
  for (char* current = start; current != last; current += size) {
    if (predicate(current, last)) {
      swap(current, storage, size);
      storage += size;
    }
  }
  swap(storage, last, size);
  return storage; // returns position of pivot
}
void quicksort(void* start, void* end, size_t size, cmp predicate) {
  if (start == end) return;
  void* middle = partition(start, end, size, predicate);
  quicksort(start, middle, size, predicate);
  quicksort((char*)middle + size, end, size, predicate);
}
void print(const int* start, int size) {
  for (int i = 0; i < size; ++i) printf("%3d", start[i]);
  printf("\n");
}
void rand_seed() {
  int   seed = 0;
  FILE* fp   = fopen("/dev/urandom", "re");
  if (!fp) {
    fprintf(stderr, "Warning: couldn't open source of randomness, falling back to time(NULL)");
    srand(time(NULL));
    return;
  }
  if (fread(&seed, sizeof(int), 1, fp) < 1) {
    fprintf(stderr, "Warning: couldn't read random seed, falling back to time(NULL)");
    fclose(fp);
    srand(time(NULL));
    return;
  }
  fclose(fp);
  srand(seed); // nice seed for rand()
}
int rand_range(int start, int end) {
  return start + rand() / (RAND_MAX / (end - start + 1) + 1);
}
int main() {
  // int demo
  rand_seed();
#define int_count 20
  int* ints = malloc(int_count * sizeof(int));
  if (!ints) {
    fprintf(stderr, "couldn't allocate memory");
    exit(EXIT_FAILURE);
  }
  for (int i = 0; i < int_count; ++i) ints[i] = rand_range(1, int_count / 2);
  print(ints, int_count);
  quicksort(ints, ints + int_count, sizeof(int), &cmp_ints_less);
  print(ints, int_count);
  free(ints);
  // string demo
  const char* strings[] = {
      "material", "rare",    "fade",      "aloof",  "way",  "torpid",
      "men",      "purring", "abhorrent", "unpack", "zinc", "unsightly",
  };
  const int str_count = sizeof(strings) / sizeof(strings[0]);
  quicksort(strings, strings + str_count, sizeof(char*), &cmp_strs_greater);
  for (int i = 0; i < str_count; ++i) printf("%s\n", strings[i]);
// double demo
#define dbl_count 20
  double doubles[dbl_count];
  for (int i = 0; i < dbl_count; ++i) doubles[i] = rand() / (RAND_MAX / 100.0);
  quicksort(doubles, doubles + dbl_count, sizeof(char*), &cmp_doubles_less);
  for (int i = 0; i < dbl_count; ++i) printf("%20.16f\n", doubles[i]);
  return EXIT_SUCCESS;
}
EDIT:
Just for reference Compiler Explorer reports the very obvious following assembly for alternative generic swap():
The sample main() there is:
int main() {
  int two = 2;
  int three = 3;
  swap(&two, &three, sizeof(int));
  swap2(&two, &three, sizeof(int));
  return two - three;
}
full assembler for swap2() below, but it is noteworthy that the compiler has inlined swap2() but not swap() which of course containes further calls to memcopy. This might be some (all?) of the difference?
swap2:
        push    rbp
        mov     rbp, rsp
        push    r14
        mov     r14, rdi
        push    r13
        mov     r13, rsi
        push    r12
        push    rbx
        cmp     rdx, 4
        je      .L9
        mov     r12, rdx
        cmp     rdx, 8
        jne     .L7
        mov     rax, QWORD PTR [rdi]
        mov     rdx, QWORD PTR [rsi]
        mov     QWORD PTR [rdi], rdx
        mov     QWORD PTR [rsi], rax
        lea     rsp, [rbp-32]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     rbp
        ret
.L7:
        lea     rax, [rdx+15]
        mov     rbx, rsp
        mov     rsi, rdi
        and     rax, -16
        sub     rsp, rax
        mov     rdi, rsp
        call    memcpy
        mov     rdx, r12
        mov     rsi, r13
        mov     rdi, r14
        call    memcpy
        mov     rdx, r12
        mov     rsi, rsp
        mov     rdi, r13
        call    memcpy
        mov     rsp, rbx
        lea     rsp, [rbp-32]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     rbp
        ret
.L9:
        mov     eax, DWORD PTR [rdi]
        mov     edx, DWORD PTR [rsi]
        mov     DWORD PTR [rdi], edx
        mov     DWORD PTR [rsi], eax
        lea     rsp, [rbp-32]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     rbp
        ret
 
    