I have a piece of code that runs at ~1.2 million runs per second after performing some tasks, the bulkiest is setting a uint8_t array with bitshifted data from two uint32_t pieces of data. The excerpt code is as follows:
    static inline uint32_t RotateRight(uint32_t val, int n)
{
    return (val >> n) + (val << (32 - n));
}
static inline uint32_t CSUInt32BE(const uint8_t *b)
{
    return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) | ((uint32_t)b[2] << 8) | (uint32_t)b[3];
}
static uint32_t ReverseBits(uint32_t val) // Usually just static, tried inline/static inline
{
    //  uint32_t res = 0;
    //  for (int i = 0; i<32; i++)
    //  {
    //      res <<= 1;
    //      res |= val & 1;
    //      val >>= 1;
    //  }
    // Original code above, benched ~220k l/s
    //val = ((val & 0x55555555) << 1) | ((val >> 1) & 0x55555555);
    //val = ((val & 0x33333333) << 2) | ((val >> 2) & 0x33333333);
    //val = ((val & 0x0F0F0F0F) << 4) | ((val >> 4) & 0x0F0F0F0F);
    //val = ((val & 0x00FF00FF) << 8) | ((val >> 8) & 0x00FF00FF);
    //val = (val << 16) | (val >> 16);
    // Option 0, benched ~770k on MBP
    uint32_t c = 0;
    c = (BitReverseTable256[val & 0xff] << 24) |
        (BitReverseTable256[(val >> 8) & 0xff] << 16) |
        (BitReverseTable256[(val >> 16) & 0xff] << 8) |
        (BitReverseTable256[val >> 24]); // was (val >> 24) & 0xff
                                         // Option 1, benched ~970k l/s on MBP, Current, minor tweak to 24
                                         //unsigned char * p = (unsigned char *)&val;
                                         //unsigned char * q = (unsigned char *)&c;
                                         //q[3] = BitReverseTable256[p[0]];
                                         //q[2] = BitReverseTable256[p[1]];
                                         //q[1] = BitReverseTable256[p[2]];
                                         //q[0] = BitReverseTable256[p[3]];
                                         // Option 2 at ~970k l/s on MBP from http://stackoverflow.com/questions/746171/best-algorithm-for-bit-reversal-from-msb-lsb-to-lsb-msb-in-c
    return c; // Current
              //    return val; // option 0
              //    return res; // original
              //uint32_t m;
              //val = (val >> 16) | (val << 16);                            // swap halfwords
              //m = 0x00ff00ff; val = ((val >> 8) & m) | ((val << 8) & ~m); // swap bytes
              //m = m^(m << 4); val = ((val >> 4) & m) | ((val << 4) & ~m); // swap nibbles
              //m = m^(m << 2); val = ((val >> 2) & m) | ((val << 2) & ~m);
              //m = m^(m << 1); val = ((val >> 1) & m) | ((val << 1) & ~m);
              //return val;
              // Benches at 850k l/s on MBP
              //uint32_t t;
              //val = (val << 15) | (val >> 17);
              //t = (val ^ (val >> 10)) & 0x003f801f;
              //val = (t + (t << 10)) ^ val;
              //t = (val ^ (val >>  4)) & 0x0e038421;
              //val = (t + (t <<  4)) ^ val;
              //t = (val ^ (val >>  2)) & 0x22488842;
              //val = (t + (t <<  2)) ^ val;
              //return val;
              // Benches at 820k l/s on MBP
}
static void StuffItDESCrypt(uint8_t data[8], StuffItDESKeySchedule *ks, BOOL enc)
{
uint32_t left = ReverseBits(CSUInt32BE(&data[0]));
uint32_t right = ReverseBits(CSUInt32BE(&data[4]));
right = RotateRight(right, 29);
left = RotateRight(left, 29);
//Encryption function runs here
left = RotateRight(left, 3);
right = RotateRight(right, 3);
uint32_t left1 = ReverseBits(left);
uint32_t right1 = ReverseBits(right);
data[0] = right1 >> 24;
data[1] = (right1 >> 16) & 0xff;
data[2] = (right1 >> 8) & 0xff;
data[3] = right1 & 0xff;
data[4] = left1 >> 24;
data[5] = (left1 >> 16) & 0xff;
data[6] = (left1 >> 8) & 0xff;
data[7] = left1 & 0xff;
Is this the most optimal way to accomplish this? I have a uint64_t version as well:
uint64_t both = ((uint64_t)ReverseBits(left) << 32) | (uint64_t)ReverseBits(right);
data[0] = (both >> 24 & 0xff);
data[1] = (both >> 16) & 0xff;
data[2] = (both >> 8) & 0xff;
data[3] = both & 0xff; 
data[4] = (both >> 56);
data[5] = (both >> 48) & 0xff;
data[6] = (both >> 40) & 0xff;
data[7] = (both >> 32) & 0xff;
I tested what would happen if I completely skipped this assignment (the ReverseBits function is still done), and the code runs at ~6.5 million runs per second. In addition, this speed hit happens if I only do just one as well, leveling out at 1.2 million even without touching the other 7 assignments.
I'd hate to think that this operation takes a massive 80% speed hit due to this work and can't be made any faster.
This is on Windows Visual Studio 2015 (though I try to keep the source as portable to macOS and Linux as possible).
Edit: The full base code is at Github. I am not the original author of the code, however I have forked it and maintain a password recovery solution using a modified for speed version. You can see my speed up successes in ReverseBits with various solutions and benched speeds.
These files are 20+ years old, and has successfully recovered files albeit at a low speed for years. See blog post.
 
     
    