#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
using namespace std;
const int p[9] =   {1, 10, 100, 
                    1000, 10000, 100000, 
                    1000000, 10000000, 100000000};
                    
class MyTimer {
 private:
  std::chrono::time_point<std::chrono::steady_clock> starter;
 public:
  void startCounter() {
    starter = std::chrono::steady_clock::now();
  }
  int64_t getCounterNs() {    
    return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
  }
};
                    
int convert1(const char *a) {
    int res = 0;
    for (int i=0; i<9; i++) res = res * 10 + a[i] - 48;
    return res;
}
int convert2(const char *a) {
    return (a[0] - 48) * p[8] + (a[1] - 48) * p[7] + (a[2] - 48) * p[6]
            + (a[3] - 48) * p[5] + (a[4] - 48) * p[4] + (a[5] - 48) * p[3]
            + (a[6] - 48) * p[2] + (a[7] - 48) * p[1] + (a[8] - 48) * p[0];
}
int convert3(const char *a) {
    return (a[0] - 48) * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5]
            + a[4] * p[4] + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8]
            - 533333328;
}
const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
    100000000};
int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0] * pu[8] + a[1] * pu[7] + a[2] * pu[6] + a[3] * pu[5] + a[4] * pu[4]
      + a[5] * pu[3] + a[6] * pu[2] + a[7] * pu[1] + a[8] - (unsigned) 5333333328u;
}
int convert5(const char* a) {
    int val = 0;
    for(size_t k =0;k <9;++k) {
        val = (val << 3) + (val << 1) + (a[k]-'0');
    }
    return val;
}
const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};
int convert6u(const char *a) {
  return a[0]*pu2[0] + a[1]*pu2[1] + a[2]*pu2[2] + a[3] * pu2[3] + a[4] * pu2[4] + a[5] * pu2[5] + a[6] * pu2[6] + a[7] * pu2[7] + a[8] - (unsigned) 5333333328u;
}
constexpr std::uint64_t zeros(char z) {
    std::uint64_t result = 0;
    for (int i = 0; i < sizeof(result); ++i) {
        result = result*256 + z;
    }
    return result;
}
int convertX(const char *a) {
    constexpr std::uint64_t offset = zeros('0');
    constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
    constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
    constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;
    std::uint64_t buffer;
    std::memcpy(&buffer, a, sizeof(buffer));
    const auto bytes = buffer - offset;
    const auto b1 = (bytes & o1) >> 8;
    const auto words = (bytes & ~o1) + 10*b1;
    const auto w1 = (words & o2) >> 16;
    const auto dwords = (words & ~o2) + 100*w1;
    const auto d1 = (dwords & o3) >> 32;
    const auto qwords = (dwords & ~o3) + 1000*d1;
    const auto final = 10*static_cast<unsigned>(qwords) + (a[9] - '0');
    return static_cast<int>(final);
}
//########################  ACCEPTED ANSWER
//########################
//########################
typedef struct {             // for output into memory
    alignas(16) unsigned hours;
    unsigned minutes, seconds, nanos;
} hmsn;
void str2hmsn(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    //hmsn retval;
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    __m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places.
    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);                           // SSE4.1  hours, minutes, seconds unpack from uint16_t to uint32
    //_mm_storeu_si128((__m128i*)&retval, hms_unpacked);                                  // store first 3 struct members; last to be written separately
    _mm_storeu_si128((__m128i*)out, hms_unpacked);
    // or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx
    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.
    __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
    uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
    uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
    //retval.nanos = nanos;
    //return retval;
  // returning the struct by value encourages compilers in the wrong direction
  // into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}
hmsn mystruct;
int convertSIMD(const char* a)
{
    str2hmsn(&mystruct, a);
    return mystruct.nanos;
}
//########################
//########################
using ConvertFunc = int(const char*);
volatile int result = 0; // do something with the result of function to prevent unexpected optimization
void benchmark(ConvertFunc converter, string name, int numTest=1000) {
    MyTimer timer;
    const int N = 100000;
    char *a = new char[9*N + 17];
    int64_t runtime = 0;    
    for (int t=1; t<=numTest; t++) {        
        // change something to prevent unexpected optimization
        for (int i=0; i<9*N; i++) a[i] = rand() % 10 + '0'; 
        timer.startCounter();
        for (int i=0; i<9*N; i+= 9) result = converter(a+i);
        runtime += timer.getCounterNs();
    }
    cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
    delete[] a;
}
int main() {
    benchmark(convert1, "slow");
    benchmark(convert2, "normal");    
    benchmark(convert3, "fast");
    benchmark(convert4u, "unsigned");
    benchmark(convert5, "shifting");
    benchmark(convert6u, "reverse");
    benchmark(convertX, "swar64");
    benchmark(convertSIMD, "manualSIMD");
    return 0;
}
I want to find the fastest way turn char a[9] into an int. The full problem is convert char a[15] with form HHMMSSxxxxxxxxx timestamp to nanosecond, where ~50 bytes after the x are allocated and can be safely read (but not write). We only care about the last 9 digits in this question.
Version 1 is basic, version 2,3 try to save some computation. I compile with -O3 flag, and storing power of 10s in array is fine because it is optimized away (checked using Godbolt).
How can I make this faster? Yes I know this sounds like premature optimization, but let's assume I need that final 2-3% boost.
Version 3 is decidedly faster, while version 2 is slower due to code size. But can we do better than version 3? Invalid benchmark
Edit 2: the function can return unsigned int instead of int (i.e
unsigned convert1(char *a);
Edit 3: I noticed that the new code is an invalid benchmark, since convert(a) is only executed once. Using the original code, the difference is only ~1%.
Edit 4: New benchmark. using unsigned (convert4u, convert6u) is consistently 3-5% faster than using int. I will run a long (10+ min) benchmark to see if there's a winner. I've edited the code to use a new benchmark. It generates a large amount of data, then run the converter functions.
Edit 5: results: 4.19, 4.51, 3.82, 3.59, 7.64, 3.72 seconds. The unsigned version is fastest. Is it possible to use SIMD on just 9 bytes? If not, then I guess this is the best solution. I still hope there's a crazier solution, though
Edit 6: benchmark result on AMD Ryzen 4350G, gcc version 10.3, compile command gcc -o main main.cpp -std=c++17 -O3 -mavx -mavx2 -march=native
slow: 4.17794ns average
normal: 2.59945ns average
fast: 2.27917ns average
unsigned: 2.43814ns average
shifting: 4.72233ns average
reverse: 2.2274ns average
swar64: 2.17179ns average
manualSIMD: 1.55203ns average
The accepted answer does even more than the question require and compute HH/MM/SS/nanosec, so it's even faster than this benchmark shows.
 
     
     
    