Ran a benchmark and uint32_fast_t is 8 byte but slower than 4 byte uint32_t for nearly all operations. If this is the case why does uint32_fast_t not stay as 4 bytes?
OS info: 5.3.0-62-generic #56~18.04.1-Ubuntu SMP Wed Jun 24 16:17:03 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
Cpu info:
cat /sys/devices/cpu/caps/pmu_name
skylake
model name  : Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz
Benchmark I used for testing:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstdint>
#define TEST_SIZE (1u << 26)
#define ADD(X, Y)     X += Y
#define SUB(X, Y)     X -= Y
#define MULT(X, Y)    X *= Y
#define DIV(X, Y)     X /= Y
#define MOD(X, Y)     X = X % Y
#define AND(X, Y)     X &= Y
#define OR(X, Y)      X |= Y
#define XOR(X, Y)     X ^= Y
// if you compile this make sure to have -DOP=<Operation macro name here>
#define bench_flush_all_pending()    asm volatile("" : : : "memory");
#define bench_do_not_optimize_out(X) asm volatile("" : : "r,m"(X) : "memory")
uint64_t inline __attribute__((always_inline)) __attribute__((const))
get_cycles() {
    uint32_t hi, lo;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return (((uint64_t)lo) | (((uint64_t)hi) << 32));
}
constexpr uint32_t
size_ratio() {
    return sizeof(uint_fast32_t) / sizeof(uint32_t);
}
int
main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: ./%s <A or B>\n", argv[0]);
    }
    uint_fast32_t * valsf32 =
        (uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        valsf32[i] = rand();
    }
    uint32_t * vals32 = (uint32_t *)valsf32;
    uint_fast32_t sinkf32   = rand();
    uint32_t      sink32    = rand();
    uint64_t      start, end;
    if (argv[1][0] == 'A') {
        start = get_cycles();
#ifndef DO_NOTHING
        for (uint32_t i = 0; i < TEST_SIZE; ++i) {
            OP(sinkf32, valsf32[i]);
        }
        bench_do_not_optimize_out(sinkf32);
        bench_flush_all_pending();
#endif
        end = get_cycles();
    }
    else if (argv[1][0] == 'B') {
        start = get_cycles();
#ifndef DO_NOTHING
        for (uint32_t i = 0; i < TEST_SIZE; ++i) {
            OP(sink32, vals32[size_ratio() * i]);
        }
        bench_do_not_optimize_out(sink32);
        bench_flush_all_pending();
#endif
        end = get_cycles();
    }
    else {
        bench_do_not_optimize_out(sinkf32);
        bench_do_not_optimize_out(sink32);
    }
    fprintf(stderr,
            "\t%s (%zu Bytes): %.2E \"Cycles\" Per operator\n",
            argv[1][0] == 'A' ? "Fast u32" : "Norm u32",
            argv[1][0] == 'A' ? sizeof(uint_fast32_t) : sizeof(uint32_t),
            ((double)(end - start)) / TEST_SIZE);
    free(valsf32);
}
I compiled this with:
g++ -O3 test_operations.cc -o test_ops -DOP=<Operation macro name here> 
i.e to test addition:
g++ -O3 test_operations.cc -o test_ops -DOP=ADD
To get a baseline just include -DDO_NOTHING
I'm hoping someone could either explain the advantage of having uint32_fast_t as 8 bytes or something I missed in this benchmark so that I can actually get a good comparison.
The same holds with -fno-unroll-loops
Here is a example run:
Running: test_ADD Fast u32
    Fast u32 (8 Bytes): 8.52E-01 "Cycles" Per operator
Running: test_ADD Norm u32
    Norm u32 (4 Bytes): 7.92E-01 "Cycles" Per operator
Running: test_SUB Fast u32
    Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_SUB Norm u32
    Norm u32 (4 Bytes): 7.94E-01 "Cycles" Per operator
Running: test_MULT Fast u32
    Fast u32 (8 Bytes): 2.54E+00 "Cycles" Per operator
Running: test_MULT Norm u32
    Norm u32 (4 Bytes): 1.26E+00 "Cycles" Per operator
Running: test_DIV Fast u32
    Fast u32 (8 Bytes): 1.48E+01 "Cycles" Per operator
Running: test_DIV Norm u32
    Norm u32 (4 Bytes): 1.09E+01 "Cycles" Per operator
Running: test_MOD Fast u32
    Fast u32 (8 Bytes): 1.52E+01 "Cycles" Per operator
Running: test_MOD Norm u32
    Norm u32 (4 Bytes): 1.20E+01 "Cycles" Per operator
Running: test_AND Fast u32
    Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_AND Norm u32
    Norm u32 (4 Bytes): 7.98E-01 "Cycles" Per operator
Running: test_OR Fast u32
    Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_OR Norm u32
    Norm u32 (4 Bytes): 8.00E-01 "Cycles" Per operator
Running: test_XOR Fast u32
    Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_XOR Norm u32
    Norm u32 (4 Bytes): 7.95E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
    Fast u32 (8 Bytes): 2.09E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
    Norm u32 (4 Bytes): 2.09E-07 "Cycles" Per operator 
updated initialization function:
    uint_fast32_t * valsf32 =
        (uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        valsf32[i] = (uint32_t)rand(); // rand max is already an int by why not
        valsf32[i] += (valsf32[i] == 0);
        assert(valsf32[i]);
    }
    uint32_t * vals32 = (uint32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        vals32[size_ratio() * i] = (uint32_t)valsf32[i];
        vals32[size_ratio() * i] += (vals32[size_ratio() * i] == 0);
        assert(vals32[size_ratio() * i]);
        assert(vals32[size_ratio() * i] == valsf32[i]);
    }
Results:
Running: test_ADD Fast u32
    Fast u32 (8 Bytes): 8.27E-01 "Cycles" Per operator
Running: test_ADD Norm u32
    Norm u32 (4 Bytes): 7.83E-01 "Cycles" Per operator
Running: test_SUB Fast u32
    Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_SUB Norm u32
    Norm u32 (4 Bytes): 7.72E-01 "Cycles" Per operator
Running: test_MULT Fast u32
    Fast u32 (8 Bytes): 2.55E+00 "Cycles" Per operator
Running: test_MULT Norm u32
    Norm u32 (4 Bytes): 1.28E+00 "Cycles" Per operator
Running: test_DIV Fast u32
    Fast u32 (8 Bytes): 1.49E+01 "Cycles" Per operator
Running: test_DIV Norm u32
    Norm u32 (4 Bytes): 1.10E+01 "Cycles" Per operator
Running: test_MOD Fast u32
    Fast u32 (8 Bytes): 1.53E+01 "Cycles" Per operator
Running: test_MOD Norm u32
    Norm u32 (4 Bytes): 1.25E+01 "Cycles" Per operator
Running: test_AND Fast u32
    Fast u32 (8 Bytes): 8.35E-01 "Cycles" Per operator
Running: test_AND Norm u32
    Norm u32 (4 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_OR Fast u32
    Fast u32 (8 Bytes): 8.31E-01 "Cycles" Per operator
Running: test_OR Norm u32
    Norm u32 (4 Bytes): 7.76E-01 "Cycles" Per operator
Running: test_XOR Fast u32
    Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_XOR Norm u32
    Norm u32 (4 Bytes): 7.82E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
    Fast u32 (8 Bytes): 1.79E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
    Norm u32 (4 Bytes): 1.79E-07 "Cycles" Per operator
To test memory bandwidth added a TOUCH operator:
#define TOUCH(X, Y) X = Y
Results of TOUCH:
    Fast u32 (8 Bytes): 1.05E+00 "Cycles" Per operator
    Norm u32 (4 Bytes): 1.04E+00 "Cycles" Per operator
Added:
#define THREE_WAY ((A * vals8[i] + B) * vals8[i] + C)
#define TW(X, Y) X ^= THREE_WAY
// A, B, C, vals8, and i are all define
...
    uint8_t * vals8 = (uint8_t *)calloc(TEST_SIZE, 1);
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        vals8[i] = rand();
    }
...
       if (argv[1][0] == 'A') {
        uint_fast32_t A = rand();
        uint_fast32_t B = rand();
        uint_fast32_t C = rand();
...
    else if (argv[1][0] == 'B') {
        uint32_t A = rand();
        uint32_t B = rand();
        uint32_t C = rand();
Result of OP=TW
    Fast u32 (8 Bytes): 2.01E+00 "Cycles" Per operator
    Norm u32 (4 Bytes): 9.48E-01 "Cycles" Per operator
Edit 6: Added CPU info.
