I have two functions below which in terms of execution should be identical. The only difference is that in the second function, funcB(), I moved the branch on (decobj == NULL) outside the loop which is always NULL anyway. I thought that the compiler would easily optimize funcA() so that it would have identical execution time to FuncB(), but that's not the case and there is a big difference in execution time:
FuncA took: 3644530 uS
FuncB took: 1664598 uS
FuncB took: 1626528 uS
FuncA took: 3108780 uS
Finished!
I am using Visual Studio 2019 (Preview), compiled x64 with full optimization enabled:
Does anyone know how to get Visual Studio to optimize funcA() to make it as fast as funcB()?
#include <iostream>
#include <chrono>
using namespace std;
struct DecoderObj {
    int decode(int i, int j) {
        return i + j;
    }
};
static DecoderObj* decobj = nullptr;
static const int iterations = 3000;
void funcA() {
    auto start = chrono::high_resolution_clock::now();
    for (int rep = 0; rep < iterations; rep++) {
        for (int i = 0; i < 999; i++) {
            for (int j = 0; j < 999; j++) {
                int a = decobj ? decobj->decode(i, j) : i - j;
                if (a > 99999) {
                    cout << "This should never print";
                    return;
                }
            }
        }
    }
    auto end = chrono::high_resolution_clock::now();
    cout << "FuncA took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
}
void funcB() {
    auto start = chrono::high_resolution_clock::now();
    for (int rep = 0; rep < iterations; rep++) {
        if (decobj != nullptr) {
            for (int i = 0; i < 999; i++) {
                for (int j = 0; j < 999; j++) {
                    int a = decobj->decode(i, j);
                    if (a > 99999) {
                        cout << "This should never print";
                        return;
                    }
                }
            }
        }
        else { //decobj is NULL
            for (int i = 0; i < 999; i++) {
                for (int j = 0; j < 999; j++) {
                    int a = i - j;
                    if (a > 99999) {
                        cout << "This should never print";
                        return;
                    }
                }
            }
        }
    }
    auto end = chrono::high_resolution_clock::now();
    cout << "FuncB took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
}
int main() {
    funcA();
    funcB();
    funcB();
    funcA();
    std::cout << "Finished!\n";
}
The other odd thing I noticed is that if I add decobj = nullptr; to the beginning of funcB(), the function actually slows down and takes about twice as long. Not sure why that would be the case as it's even more obvious for the compiler to completely eliminate the first branch and simply only generate code for the second branch. I don't see why doing that would have slowed down function execution.
Here is the assembly code for funcA() (I removed the timers):
; 17   :     for (int rep = 0; rep < iterations; rep++) {
    mov r9, QWORD PTR ?decobj@@3PEAUDecoderObj@@EA
    xor r10d, r10d
    npad    6
$LL4@funcA:
; 18   :         for (int i = 0; i < 999; i++) {
    xor r8d, r8d
    npad    13
$LL7@funcA:
; 19   :             for (int j = 0; j < 999; j++) {
    xor eax, eax
    mov edx, r8d
$LL10@funcA:
; 20   :                 int a = decobj ? decobj->decode(i, j) : i - j;
    lea ecx, DWORD PTR [rax+r8]
    test    r9, r9
    jne SHORT $LN14@funcA
    mov ecx, edx
$LN14@funcA:
; 21   :                 if (a > 99999) {
    cmp ecx, 99999              ; 0001869fH
    jg  SHORT $LN20@funcA
; 19   :             for (int j = 0; j < 999; j++) {
    inc eax
    dec edx
    cmp eax, 999                ; 000003e7H
    jl  SHORT $LL10@funcA
; 18   :         for (int i = 0; i < 999; i++) {
    inc r8d
    cmp r8d, 999                ; 000003e7H
    jl  SHORT $LL7@funcA
; 16   :     //auto start = chrono::high_resolution_clock::now();
; 17   :     for (int rep = 0; rep < iterations; rep++) {
    inc r10d
    cmp r10d, 3000              ; 00000bb8H
    jl  SHORT $LL4@funcA
; 23   :                     return;
; 24   :                 }
; 25   :             }
; 26   :         }
; 27   :     }
; 28   :     //auto end = chrono::high_resolution_clock::now();
; 29   :     //cout << "FuncA took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
; 30   : }
    ret 0
$LN20@funcA:
; 22   :                     cout << "This should never print";
    mov rcx, QWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    lea rdx, OFFSET FLAT:??_C@_0BI@DPJNIIKO@This?5should?5never?5print@
    jmp ??$?6U?$char_traits@D@std@@@std@@YAAEAV?$basic_ostream@DU?$char_traits@D@std@@@0@AEAV10@PEBD@Z ; std::operator<<<std::char_traits<char> >
?funcA@@YAXXZ ENDP                  ; funcA
Here is the assembly code for FuncB(). I also removed the timer here too:
_TEXT   SEGMENT
?funcB@@YAXXZ PROC                  ; funcB, COMDAT
; 33   :     //auto start = chrono::high_resolution_clock::now();
; 34   :     for (int rep = 0; rep < iterations; rep++) {
    mov r9, QWORD PTR ?decobj@@3PEAUDecoderObj@@EA
    xor r8d, r8d
    npad    6
$LL4@funcB:
; 35   :         if (decobj != nullptr) {
    xor edx, edx
    test    r9, r9
    je  SHORT $LL13@funcB
    npad    9
$LL7@funcB:
; 37   :                 for (int j = 0; j < 999; j++) {
    xor eax, eax
$LL10@funcB:
; 38   :                     int a = decobj->decode(i, j);
; 39   :                     if (a > 99999) {
    lea ecx, DWORD PTR [rax+rdx]
    cmp ecx, 99999              ; 0001869fH
    jg  SHORT $LN30@funcB
; 37   :                 for (int j = 0; j < 999; j++) {
    inc eax
    cmp eax, 999                ; 000003e7H
    jl  SHORT $LL10@funcB
; 36   :             for (int i = 0; i < 999; i++) {
    inc edx
    cmp edx, 999                ; 000003e7H
    jl  SHORT $LL7@funcB
; 40   :                         cout << "This should never print";
; 41   :                         return;
; 42   :                     }
; 43   :                 }
; 44   :             }
; 45   :         }
    jmp SHORT $LN2@funcB
$LL13@funcB:
; 48   :                 for (int j = 0; j < 999; j++) {
    xor ecx, ecx
    mov eax, edx
$LL16@funcB:
; 49   :                     int a = i - j;
; 50   :                     if (a > 99999) {
    cmp eax, 99999              ; 0001869fH
    jg  SHORT $LN30@funcB
; 48   :                 for (int j = 0; j < 999; j++) {
    inc ecx
    dec eax
    cmp ecx, 999                ; 000003e7H
    jl  SHORT $LL16@funcB
; 46   :         else { //decobj is NULL
; 47   :             for (int i = 0; i < 999; i++) {
    inc edx
    cmp edx, 999                ; 000003e7H
    jl  SHORT $LL13@funcB
$LN2@funcB:
; 33   :     //auto start = chrono::high_resolution_clock::now();
; 34   :     for (int rep = 0; rep < iterations; rep++) {
    inc r8d
    cmp r8d, 3000               ; 00000bb8H
    jge SHORT $LN3@funcB
    jmp SHORT $LL4@funcB
$LN30@funcB:
; 51   :                         cout << "This should never print";
; 52   :                         return;
; 53   :                     }
; 54   :                 }
; 55   :             }
; 56   :         }
; 57   :     }
; 58   :    // auto end = chrono::high_resolution_clock::now();
; 59   :     //cout << "FuncB took: " << chrono::duration_cast<chrono::microseconds>(end - start).count() << " uS" << endl;
; 60   : }
    mov rcx, QWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    lea rdx, OFFSET FLAT:??_C@_0BI@DPJNIIKO@This?5should?5never?5print@
    jmp ??$?6U?$char_traits@D@std@@@std@@YAAEAV?$basic_ostream@DU?$char_traits@D@std@@@0@AEAV10@PEBD@Z ; std::operator<<<std::char_traits<char> >
$LN3@funcB:
    ret 0
?funcB@@YAXXZ ENDP                  ; funcB
_TEXT   ENDS
The code for the two functions is very different even though I think the compiler should have generated the same code for both. It should have realized that decobj will always be NULL and never generated code for that branch. Both funcA and funcB test the decobj pointer for NULL.
I also ran the Performance Profiler and it also shows funcA() spending time on the decode obj NULL check:
Here is the profile for FuncB():



