I am studying AVX by writing AVX code with inline assembly. In this case, I tried to implement AVX in a simple function. The function name I made is lower_all_chars_base.
Its behavior is: Apply logical OR to every single char in std::string with 0x20.
- Let c be every single char in the input.
- Assuming the input only contains chars in this set 'A' <= c && c <= 'Z'.
So the function will make the characters be lower case.
I tried to make the AVX version of the function, the store instruction was unaligned, and there was no speed up at all.
Then I thought, if the memory access is aligned, then it must be faster. After that, I tried to make the AVX version with aligned store, but still gcc base optimization -O3 beats up my vectorized code by hand. What am I doing wrong here?
Functions Summary
- lower_all_chars_base: simple function.
- lower_all_chars_avx_aligned: AVX2 aligned move version:
- Process first unaligned memory with base operation.
- Then process aligned memory part with AVX2 and aligned move.
- Then the rest with base operation again.
- lower_all_chars_avx_unaligned: AVX2 unaligned move version:
- Process the data with AVX2 and unaligned move
- Then the rest with base operation.
Questions
- Why does gcc base optimization -O3 beat up my optimization?
- What am I doing wrong here?
- What is the proper AVX operation to do this?
Benchmark Result
- CPU: Intel(R) Xeon(R) CPU E5-2650 v4 (2.20GHz)
- Microarchitecture: Broadwell
root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min   =  0.00662300
Max   =  0.00793100
Avg   =  0.00717280
Total =  0.07172800
lower_all_chars_avx_aligned
Min   =  0.00650200
Max   =  0.00785100
Avg   =  0.00726220
Total =  0.07262200
lower_all_chars_avx_unaligned
Min   =  0.00623600
Max   =  0.00835000
Avg   =  0.00701360
Total =  0.07013600
Code
Edit: N - 1 for the memset.
Godbolt link: https://godbolt.org/z/a16cGK
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <string>
using std::string;
/* Scalar baseline: ORs every byte of the string with 0x20. */
void lower_all_chars_base(string &str);
/* AVX2 version using aligned stores (vmovdqa). */
void lower_all_chars_avx_aligned(string &str);
/* AVX2 version using unaligned stores (vmovdqu). */
void lower_all_chars_avx_unaligned(string &str);
/* Runs fx() on x repeatedly and prints min/max/avg/total timings. */
void do_benchmark(std::string &x, void (*fx)(string &));
/* Evicts the buffer from cache with clflush, one cache line at a time. */
void mem_flush(const void *p, unsigned int allocation_size);
/* Benchmark buffer size: 40 MiB. */
#define N (size_t)(1024u * 1024 * 40)
/* Prints the function's name, then benchmarks it. */
#define BENCHMARK(STR, FX) do { \
 puts(#FX); \
 do_benchmark(STR, FX); \
} while(0)
int main() {
  static char x[N];
  memset(x, 'A', N - 1);
  string a(x), b(x), c(x);
  
  BENCHMARK(a, lower_all_chars_base);
  BENCHMARK(b, lower_all_chars_avx_aligned);
  BENCHMARK(c, lower_all_chars_avx_unaligned);
  assert(a == b);
  assert(b == c);
  memset(x, 'a', N - 1);
  assert(memcmp(c.c_str(), x, N - 1) == 0);
}
void do_benchmark(std::string &x, void (*fx)(string &)) {
  /*
   * Time fx(x) over kRuns iterations and report min/max/avg/total
   * seconds.  After every run the string's memory is flushed from
   * cache so each iteration starts from DRAM.
   */
  constexpr size_t kRuns = 10;
  double lo = 0, hi = 0, sum = 0;
  for (size_t run = 0; run < kRuns; run++) {
    const clock_t start = clock();
    fx(x);
    const clock_t stop = clock();
    const double elapsed = (double)(stop - start) / CLOCKS_PER_SEC;
    sum += elapsed;
    if (run == 0) {
      lo = hi = elapsed;      /* first sample seeds both extremes */
    } else if (elapsed < lo) {
      lo = elapsed;
    } else if (elapsed > hi) {
      hi = elapsed;
    }
    mem_flush(x.c_str(), x.size());
  }
  printf("Min   =  %.8f\n", lo);
  printf("Max   =  %.8f\n", hi);
  printf("Avg   =  %.8f\n", sum / (double)kRuns);
  printf("Total =  %.8f\n\n", sum);
}
__attribute__((noinline))
void lower_all_chars_base(string &str) {
  /*
   * Scalar reference implementation: OR every byte with 0x20, which
   * maps 'A'..'Z' onto 'a'..'z' (input is assumed to be in that set).
   */
  char *p = str.data();
  const size_t n = str.size();
  for (size_t i = 0; i < n; i++) {
    p[i] |= 0x20;
  }
}
/* 256-bit OR mask: 0x20 replicated into all 32 bytes.  Aligned to 32
   bytes so the vmovdqa load into ymm0 does not fault. */
static const uint64_t mask[] __attribute__((aligned(32))) = {
  0x2020202020202020ull, 0x2020202020202020ull,
  0x2020202020202020ull, 0x2020202020202020ull
};
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
  /*
   * Lower-case str in place (input assumed 'A'..'Z') using AVX2 with
   * aligned stores.  Layout: scalar head until the pointer reaches a
   * 32-byte boundary, then 288-byte (9 x ymm) chunks with
   * vpor + vmovdqa, then a scalar tail.
   *
   * NOTE(review): with a 40 MiB working set this loop is memory-bound,
   * so it cannot be expected to beat gcc -O3's auto-vectorized scalar
   * loop — confirm with perf counters.
   */
  /* Function-local OR mask, 32-byte aligned for the vmovdqa load. */
  alignas(32) static const uint64_t or_mask[4] = {
    0x2020202020202020ull, 0x2020202020202020ull,
    0x2020202020202020ull, 0x2020202020202020ull
  };
  char *cs = (char *)str.c_str();
  size_t len = str.size();
  /* Only use AVX for data bigger than 4K. */
  if (len > 4096) {
    /*
     * Scalar-process the unaligned head.
     * FIX: the distance to the next 32-byte boundary is
     * (32 - (cs & 31)) & 31, NOT (cs & 31).  The original advanced by
     * (cs & 31) bytes, which lands on a 32-byte boundary only when
     * cs % 32 is 0 or 16; for any other alignment the vmovdqa stores
     * below would raise #GP.
     */
    uint8_t n = (uint8_t)(-(uintptr_t)cs & 0b11111u);
    for (uint8_t i = 0; i < n; i++) {
      *cs++ |= 0x20;
    }
    len -= n;
    /* Stop 288 bytes early so every 9x32-byte chunk stays in bounds. */
    size_t vlen = len - 288;
    size_t j;
    /* Load the OR mask once; ymm0 stays live across the loop. */
    asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(or_mask):"ymm0");
    for (j = 0; j < vlen; j += 288) {
      /*
       * FIX: pass the pointer with the "r" constraint.  "p" is an
       * address-operand constraint and is not guaranteed to place the
       * pointer in a register usable as the base of (%[cs],%[j]).
       */
      asm volatile(
        "vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
        "vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
        "vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
        "vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
        "vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
        "vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
        "vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
        "vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
        "vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
        "vmovdqa\t%%ymm1, (%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm2, 32(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm3, 64(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm4, 96(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm5, 128(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm6, 160(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm7, 192(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm8, 224(%[cs],%[j])\n\t"
        "vmovdqa\t%%ymm9, 256(%[cs],%[j])"
        :
        : [cs]"r"(cs), [j]"r"(j)
        : "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
          "ymm6", "ymm7", "ymm8", "ymm9"
      );
    }
    /* Avoid SSE/AVX transition penalties after using ymm registers. */
    asm volatile("vzeroupper":::
      "ymm0", "ymm1", "ymm2", "ymm3",
      "ymm4", "ymm5", "ymm6", "ymm7",
      "ymm8", "ymm9", "ymm10", "ymm11",
      "ymm12","ymm13","ymm14","ymm15"
    );
    cs  += j;
    len -= j;
  }
  /* Scalar tail: bytes the vector loop did not cover. */
  for (size_t i = 0; i < len; i++) {
    *cs++ |= 0x20;
  }
}
__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
  /*
   * Lower-case str in place (input assumed 'A'..'Z') using AVX2 with
   * unaligned stores: 288-byte (9 x ymm) chunks of vpor + vmovdqu,
   * then a scalar tail.  No head-alignment pass is needed because
   * vmovdqu tolerates any address.
   */
  /* Function-local OR mask, 32-byte aligned for the vmovdqa load. */
  alignas(32) static const uint64_t or_mask[4] = {
    0x2020202020202020ull, 0x2020202020202020ull,
    0x2020202020202020ull, 0x2020202020202020ull
  };
  char *cs = (char *)str.c_str();
  size_t len = str.size();
  /* Only use AVX for data bigger than 4K. */
  if (len > 4096) {
    size_t j;
    /* Stop 288 bytes early so every 9x32-byte chunk stays in bounds. */
    size_t vlen  = len - 288;
    /* Load the OR mask once; ymm0 stays live across the loop. */
    asm volatile("vmovdqa %[mask], %%ymm0"::[mask]"m"(or_mask):"ymm0");
    for (j = 0; j < vlen; j += 288) {
      /*
       * FIX: pass the pointer with the "r" constraint.  "p" is an
       * address-operand constraint and is not guaranteed to place the
       * pointer in a register usable as the base of (%[cs],%[j]).
       */
      asm volatile(
        "vpor\t(%[cs],%[j]), %%ymm0, %%ymm1\n\t"
        "vpor\t32(%[cs],%[j]), %%ymm0, %%ymm2\n\t"
        "vpor\t64(%[cs],%[j]), %%ymm0, %%ymm3\n\t"
        "vpor\t96(%[cs],%[j]), %%ymm0, %%ymm4\n\t"
        "vpor\t128(%[cs],%[j]), %%ymm0, %%ymm5\n\t"
        "vpor\t160(%[cs],%[j]), %%ymm0, %%ymm6\n\t"
        "vpor\t192(%[cs],%[j]), %%ymm0, %%ymm7\n\t"
        "vpor\t224(%[cs],%[j]), %%ymm0, %%ymm8\n\t"
        "vpor\t256(%[cs],%[j]), %%ymm0, %%ymm9\n\t"
        "vmovdqu\t%%ymm1, (%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm2, 32(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm3, 64(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm4, 96(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm5, 128(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm6, 160(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm7, 192(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm8, 224(%[cs],%[j])\n\t"
        "vmovdqu\t%%ymm9, 256(%[cs],%[j])"
        :
        : [cs]"r"(cs), [j]"r"(j)
        : "memory", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
          "ymm6", "ymm7", "ymm8", "ymm9"
      );
    }
    /* Avoid SSE/AVX transition penalties after using ymm registers. */
    asm volatile("vzeroupper":::
      "ymm0", "ymm1", "ymm2", "ymm3",
      "ymm4", "ymm5", "ymm6", "ymm7",
      "ymm8", "ymm9", "ymm10", "ymm11",
      "ymm12","ymm13","ymm14","ymm15"
    );
    cs  += j;
    len -= j;
  }
  /* Scalar tail: bytes the vector loop did not cover. */
  for (size_t i = 0; i < len; i++) {
    *cs++ |= 0x20;
  }
}
void mem_flush(const void *p, unsigned int allocation_size) {
  /*
   * Evict [p, p + allocation_size) from all cache levels, one cache
   * line at a time, so the next access comes from DRAM.
   * https://stackoverflow.com/a/43694725/7275114
   */
  const size_t cache_line = 64;
  const char *cp = (const char *)p;
  /* allocation_size is unsigned, so the original "<= 0" was a
     tautological spelling of "== 0". */
  if (p == nullptr || allocation_size == 0)
    return;
  for (size_t i = 0; i < allocation_size; i += cache_line) {
    /* clflush invalidates the cache line containing cp[i]. */
    asm volatile("clflush (%0)"::"r"(&cp[i]):"memory");
  }
  /* Order the flushes before any subsequent stores. */
  asm volatile("sfence"::: "memory");
}
 
     
    