Improving performance of generic swap

Question

Context

When implementing generic functions in C that work with a range of types, void* is regularly used. The libc function qsort() is a classic example. Internally qsort() and many other algorithms require a swap() function.

A simple but typical implementation of a generic swap looks like this:

void swap(void* x, void* y, size_t size) {
    char t[size];
    memcpy(t, x, size);
    memcpy(x, y, size);
    memcpy(y, t, size);
}

For larger types a byte-by-byte swap can be used, or malloc which would be slow, but the focus here is on what happens when this generic swap() is used for small types.

A better generic swap?

It turns out that if we match for some common type sizes (int and long for 4 and 8 bytes on x86_64) which also cover float, double, pointer etc, that we can get a surprising performance improvement:

void swap(void* x, void* y, size_t size) {
  if (size == sizeof(int)) {
    int t      = *((int*)x);
    *((int*)x) = *((int*)y);
    *((int*)y) = t;
  } else if (size == sizeof(long)) {
    long t      = *((long*)x);
    *((long*)x) = *((long*)y);
    *((long*)y) = t;
  } else {
    char t[size];
    memcpy(t, x, size);
    memcpy(x, y, size);
    memcpy(y, t, size);
  }
}

NB: This could clearly be refined to using #if rather than if/else and for more types.

In the context of the below generic quicksort() implementation, the above swap gives ~2x performance improvement for a 10,000,000 random int sort, compared with the more standard memcpy() only swap at top. This is using gcc-9 or clang-10 on ubuntu 20.04 with -O3.

Question

This seems like a remarkable result.

Does this break any standards?
Can anyone validate this?
What makes this gain possible? Is it simply copying in "wider words" or is the some compiler optimization / inlining at play?
If it is indeed valid, why is it not being done already? Or is it?

NB: I have not examined the generated assembly code - yet.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

typedef bool (*cmp)(const void*, const void*);

bool cmp_ints_less(const void* a, const void* b) {
  return *(const int*)a < *(const int*)b;
}

bool cmp_ints_greater(const void* a, const void* b) {
  return *(const int*)a > *(const int*)b;
}

bool cmp_floats_less(const void* a, const void* b) {
  return *(const float*)a < *(const float*)b;
}

bool cmp_floats_greater(const void* a, const void* b) {
  return *(const float*)a > *(const float*)b;
}

bool cmp_doubles_less(const void* a, const void* b) {
  return *(const double*)a < *(const double*)b;
}

bool cmp_doubles_greater(const void* a, const void* b) {
  return *(const double*)a > *(const double*)b;
}

bool cmp_strs_less(const void* a, const void* b) {
  return strcmp(*((const char**)a), *((const char**)b)) < 0;
}

bool cmp_strs_greater(const void* a, const void* b) {
  return strcmp(*((const char**)a), *((const char**)b)) > 0;
}

void swap(void* x, void* y, size_t size) {
  if (size == sizeof(int)) {
    int t      = *((int*)x);
    *((int*)x) = *((int*)y);
    *((int*)y) = t;
  } else if (size == sizeof(long)) {
    long t      = *((long*)x);
    *((long*)x) = *((long*)y);
    *((long*)y) = t;
  } else {
    char t[size];
    memcpy(t, x, size);
    memcpy(x, y, size);
    memcpy(y, t, size);
  }
}

void* partition(void* start, void* end, size_t size, cmp predicate) {
  if (start == NULL || end == NULL || start == end) return start;
  char* storage = (char*)start;
  char* last    = (char*)end - size; // used as pivot
  for (char* current = start; current != last; current += size) {
    if (predicate(current, last)) {
      swap(current, storage, size);
      storage += size;
    }
  }
  swap(storage, last, size);
  return storage; // returns position of pivot
}

void quicksort(void* start, void* end, size_t size, cmp predicate) {
  if (start == end) return;
  void* middle = partition(start, end, size, predicate);
  quicksort(start, middle, size, predicate);
  quicksort((char*)middle + size, end, size, predicate);
}

void print(const int* start, int size) {
  for (int i = 0; i < size; ++i) printf("%3d", start[i]);
  printf("\n");
}

void rand_seed() {
  int   seed = 0;
  FILE* fp   = fopen("/dev/urandom", "re");
  if (!fp) {
    fprintf(stderr, "Warning: couldn't open source of randomness, falling back to time(NULL)");
    srand(time(NULL));
    return;
  }
  if (fread(&seed, sizeof(int), 1, fp) < 1) {
    fprintf(stderr, "Warning: couldn't read random seed, falling back to time(NULL)");
    fclose(fp);
    srand(time(NULL));
    return;
  }
  fclose(fp);
  srand(seed); // nice seed for rand()
}

int rand_range(int start, int end) {
  return start + rand() / (RAND_MAX / (end - start + 1) + 1);
}

int main() {
  // int demo
  rand_seed();
#define int_count 20
  int* ints = malloc(int_count * sizeof(int));
  if (!ints) {
    fprintf(stderr, "couldn't allocate memory");
    exit(EXIT_FAILURE);
  }
  for (int i = 0; i < int_count; ++i) ints[i] = rand_range(1, int_count / 2);
  print(ints, int_count);
  quicksort(ints, ints + int_count, sizeof(int), &cmp_ints_less);
  print(ints, int_count);
  free(ints);

  // string demo
  const char* strings[] = {
      "material", "rare",    "fade",      "aloof",  "way",  "torpid",
      "men",      "purring", "abhorrent", "unpack", "zinc", "unsightly",
  };
  const int str_count = sizeof(strings) / sizeof(strings[0]);
  quicksort(strings, strings + str_count, sizeof(char*), &cmp_strs_greater);
  for (int i = 0; i < str_count; ++i) printf("%s\n", strings[i]);

// double demo
#define dbl_count 20
  double doubles[dbl_count];
  for (int i = 0; i < dbl_count; ++i) doubles[i] = rand() / (RAND_MAX / 100.0);
  quicksort(doubles, doubles + dbl_count, sizeof(char*), &cmp_doubles_less);
  for (int i = 0; i < dbl_count; ++i) printf("%20.16f\n", doubles[i]);

  return EXIT_SUCCESS;
}

EDIT:

Just for reference Compiler Explorer reports the very obvious following assembly for alternative generic swap():

https://godbolt.org/z/GhvsY4

The sample main() there is:

int main() {
  int two = 2;
  int three = 3;

  swap(&two, &three, sizeof(int));
  swap2(&two, &three, sizeof(int));

  return two - three;
}

full assembler for swap2() below, but it is noteworthy that the compiler has inlined swap2() but not swap() which of course containes further calls to memcopy. This might be some (all?) of the difference?

swap2:
        push    rbp
        mov     rbp, rsp
        push    r14
        mov     r14, rdi
        push    r13
        mov     r13, rsi
        push    r12
        push    rbx
        cmp     rdx, 4
        je      .L9
        mov     r12, rdx
        cmp     rdx, 8
        jne     .L7
        mov     rax, QWORD PTR [rdi]
        mov     rdx, QWORD PTR [rsi]
        mov     QWORD PTR [rdi], rdx
        mov     QWORD PTR [rsi], rax
        lea     rsp, [rbp-32]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     rbp
        ret
.L7:
        lea     rax, [rdx+15]
        mov     rbx, rsp
        mov     rsi, rdi
        and     rax, -16
        sub     rsp, rax
        mov     rdi, rsp
        call    memcpy
        mov     rdx, r12
        mov     rsi, r13
        mov     rdi, r14
        call    memcpy
        mov     rdx, r12
        mov     rsi, rsp
        mov     rdi, r13
        call    memcpy
        mov     rsp, rbx
        lea     rsp, [rbp-32]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     rbp
        ret
.L9:
        mov     eax, DWORD PTR [rdi]
        mov     edx, DWORD PTR [rsi]
        mov     DWORD PTR [rdi], edx
        mov     DWORD PTR [rsi], eax
        lea     rsp, [rbp-32]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     rbp
        ret

All types that are equal to sizeof other types may not have same alignment, will it work for character literals and pointers? — IrAM, Dec 30 '20 at 12:48
Correct. A serious implementation would need to take care of that. As point out by @WhozCraig in comment under Andrew Henle 's answer this is solvable. It works on my architecture ( x86_64 ) for all the types in the `main()` above, which includes `pointers` and `doubles`, but for a cross platform implementation some additional care with alignment would be needed. — Oliver Schönrock, Dec 30 '20 at 13:12
*It works on my architecture ( x86_64 ) for all the types in the main() above* The problem is, you have to hope your compiler doesn't do something like use an SSE2 instruction to implement your code. There ***are*** potential alignment issues on x86, although waaaay too many programmers aren't aware of the possibilities. — Andrew Henle, Dec 30 '20 at 13:24
@AndrewHenle Thanks. I was certainly aware of touching on the edge of what would be compliant, let alone portable. Can it be made to be compliant? If it can't then that explains why I can achieve 2x faster? If it can, then it begs the question of why common `memcpy` is not doing something like this already? — Oliver Schönrock, Dec 30 '20 at 13:27

score 2 · Answer 1 · answered Dec 30 '20 at 12:44

2

Does this break any standards?

Yes.

It's a strict aliasing violation and can violate 6.3.2.3 Pointers, paragraph 7: "A pointer to an object type may be converted to a pointer to a different object type. If the resulting pointer is not correctly aligned for the referenced type, the behavior is undefined. ..."

answered Dec 30 '20 at 12:44

Andrew Henle

32,625
3
24
56

OK, but does that not violate the entire idea of a generic `void*` qsort for example? – Oliver Schönrock Dec 30 '20 at 12:48
Note that this too can be overcome by building a smarter swap-mechanic that detects leading off-alignment content, byte swaps that, then picks up on-alignment for the bulk of the content. This is not uncommon for memory movement algorithms (most decent memcpy implementations do this already). – WhozCraig Dec 30 '20 at 12:49
@OliverSchönrock no, paragraph 1 from the same section mentions `A pointer to void may be converted to or from a pointer to any object type. A pointer to any object type may be converted to a pointer to void and back again; the result shall compare equal to the original pointer.` – Chase Dec 30 '20 at 12:51
@WhozCraig Yes exactly that was my expectation! ie, how can I beat a modern `libc` by 2x without even trying too hard? I must be missing something? – Oliver Schönrock Dec 30 '20 at 12:52
@OliverSchönrock: This is a classical trade-off that you'll run into time and time again: Generic libraries work in most cases but once in a while you'll need to specialize to get the performance you need. – 500 - Internal Server Error Dec 30 '20 at 13:03
@500-InternalServerError Yes, that makes perfect sense, however the new suggested generic `swap()` implementation, if valid, gets the "best of both worlds"? Certainly in this use case, it performs on par with a type-specialised swap, but is generic and 2x faster than the "obvious" generic one right at top. – Oliver Schönrock Dec 30 '20 at 13:07
Will your version not be (marginally) slower if the size is in fact not that of either `int` or `long`? – 500 - Internal Server Error Dec 30 '20 at 13:10
@Chase An actual location in memory doesn't have type `void`, so the "pointer to `void`" that gets passed in this question is actually referring to *something else*. It doesn't matter what the pointer value is - it matters what the pointer references. And the strict aliasing rule means you can't treat a value as something it's not (except you can always treat anything as an array of `[[un]signed] char`). And that's exactly what the code in this question is trying to do - treat memory as something it's not. It usually works - but it's non-conforming. The alignment issue is entirely separate – Andrew Henle Dec 30 '20 at 13:16
(cont) and it's usually the alignment that causes problems. Just Google "`SIGBUS` ARM" or "`SIGBUS` SPARC". Not all hardware is as forgiving of alignment issues as x86 - and even that's not totally safe. – Andrew Henle Dec 30 '20 at 13:18
@AndrewHenle Thanks. So do you agree with WhozCraig that with some care this could be made compliant and cross platform? – Oliver Schönrock Dec 30 '20 at 13:23
@OliverSchönrock You're pretty much doing what every optimized `libc` implementation does. It'll never be strictly compliant because you're taking advantage of platform-specific features and breaking some rules such that you have to be really careful. Even down to looking at how your compiler implements your code. Yes, there are questions here where GCC used SSE instructions with alignment restrictions that caused failures on x86, so that's not immune. But there's no reason why it can't be made to work. – Andrew Henle Dec 30 '20 at 14:34
@AndrewHenle That makes good sense to me. I guess my main "surprise" in all this is that the `libc` which you get on a modern linux, ubuntu being perhaps one of the most common server OS's these days, does not appear to have such optimisations in its `memcpy` implementation? Am I wrong to expect that? – Oliver Schönrock Dec 30 '20 at 14:41
@AndrewHenle I have just added a godbolt link and assembler with notes as appendix to orig question above. Note that my new `swap()` is being inlined with `-O3` and the original `memcpy` is not. Does that explain at least some of the difference? – Oliver Schönrock Dec 30 '20 at 14:48

Improving performance of generic swap

Context

A better generic swap?

Question

1 Answers1