I came up with three solutions so far:
The extremely inefficient standard library pow and log2 functions:
int_fast16_t powlog(uint_fast16_t n)
{
  return static_cast<uint_fast16_t>(pow(2, floor(log2(n))));
}
Far more efficient counting subsequent powers of 2 until I reach a greater number than I had to reach:
uint_fast16_t multiply(uint_fast16_t n)
{
  uint_fast16_t maxpow = 1;
  while(2*maxpow <= n)
    maxpow *= 2;
  return maxpow;
}
The most efficient so far binsearching a precomputed table of powers of 2:
uint_fast16_t binsearch(uint_fast16_t n)
{
  static array<uint_fast16_t, 20> pows {1,2,4,8,16,32,64,128,256,512,
    1024,2048,4096,8192,16384,32768,65536,131072,262144,524288};
  return *(upper_bound(pows.begin(), pows.end(), n)-1);
}
Can this be optimized even more? Any tricks that could be used here?
Full benchmark I used:
#include <iostream>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <array>
#include <algorithm>
using namespace std;
using namespace chrono;
uint_fast16_t powlog(uint_fast16_t n)
{
  return static_cast<uint_fast16_t>(pow(2, floor(log2(n))));
}
uint_fast16_t multiply(uint_fast16_t n)
{
  uint_fast16_t maxpow = 1;
  while(2*maxpow <= n)
    maxpow *= 2;
  return maxpow;
}
uint_fast16_t binsearch(uint_fast16_t n)
{
  static array<uint_fast16_t, 20> pows {1,2,4,8,16,32,64,128,256,512,
    1024,2048,4096,8192,16384,32768,65536,131072,262144,524288};
  return *(upper_bound(pows.begin(), pows.end(), n)-1);
}
high_resolution_clock::duration test(uint_fast16_t(powfunct)(uint_fast16_t))
{
  auto tbegin = high_resolution_clock::now();
  volatile uint_fast16_t sink;
  for(uint_fast8_t i = 0; i < UINT8_MAX; ++i)
    for(uint_fast16_t n = 1; n <= 999999; ++n)
      sink = powfunct(n);
  auto tend = high_resolution_clock::now();
  return tend - tbegin;
}
int main()
{
  cout << "Pow and log took " << duration_cast<milliseconds>(test(powlog)).count() << " milliseconds." << endl;
  cout << "Multiplying by 2 took " << duration_cast<milliseconds>(test(multiply)).count() << " milliseconds." << endl;
  cout << "Binsearching precomputed table of powers took " << duration_cast<milliseconds>(test(binsearch)).count() << " milliseconds." << endl;
}
Compiled with -O2 this gave the following results on my laptop:
Pow and log took 19294 milliseconds.
Multiplying by 2 took 2756 milliseconds.
Binsearching precomputed table of powers took 2278 milliseconds.
 
     
     
     
     
    