The following code shows a large performance difference between the two versions of max_3 on my machine (Windows 7, VC++ 2015, release build).
#include <algorithm>
#include <chrono>
#include <iostream>
#include <random>
#include <vector>
template <typename X>
const X& max_3_left( const X& a, const X& b, const X& c )
{
    // Left-nested: the inner std::max compares a and b first.
    return std::max( std::max( a, b ), c );
}
template <typename X>
const X& max_3_right( const X& a, const X& b, const X& c )
{
    // Right-nested: the inner std::max compares b and c first.
    return std::max( a, std::max( b, c ) );
}
int main()
{
    std::random_device r;
    std::default_random_engine e1( r() );
    std::uniform_int_distribution<int> uniform_dist( 1, 6 );
    // Fill a vector with 1000 random values in [1, 6].
    std::vector<int> numbers;
    for ( int i = 0; i < 1000; ++i )
        numbers.push_back( uniform_dist( e1 ) );
    // Time the left-nested version over all 10^9 index combinations.
    auto start1 = std::chrono::high_resolution_clock::now();
    int sum1 = 0;
    for ( int i = 0; i < 1000; ++i )
        for ( int j = 0; j < 1000; ++j )
            for ( int k = 0; k < 1000; ++k )
                sum1 += max_3_left( numbers[i], numbers[j], numbers[k] );
    auto finish1 = std::chrono::high_resolution_clock::now();
    std::cout << "left  " << sum1 << " " <<
        std::chrono::duration_cast<std::chrono::microseconds>(finish1 - start1).count()
        << " us" << std::endl;
    // Time the right-nested version over the same combinations.
    auto start2 = std::chrono::high_resolution_clock::now();
    int sum2 = 0;
    for ( int i = 0; i < 1000; ++i )
        for ( int j = 0; j < 1000; ++j )
            for ( int k = 0; k < 1000; ++k )
                sum2 += max_3_right( numbers[i], numbers[j], numbers[k] );
    auto finish2 = std::chrono::high_resolution_clock::now();
    std::cout << "right " << sum2 << " " <<
        std::chrono::duration_cast<std::chrono::microseconds>(finish2 - start2).count()
        << " us" << std::endl;
}
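The build was a standard release configuration; an equivalent command-line build would be roughly the following (the file name and exact flags here are illustrative, not my actual invocation):

cl /EHsc /O2 max3.cpp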
Output:
left  739861041 796056 us
right 739861041 1442495 us
On ideone the difference is smaller but still not negligible.
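To be clear, the two variants compute the same value and differ only in how the nested std::max calls associate. A minimal standalone sketch of the two expansions (the temporary names t1 and t2 are illustrative, not part of the code above):

#include <algorithm>
#include <iostream>
int main()
{
    int a = 1, b = 2, c = 3;
    // max_3_left: the inner call compares a and b first.
    const int& t1 = std::max( a, b );      // reference to a or b
    const int& left = std::max( t1, c );   // then compared against c
    // max_3_right: the inner call compares b and c first.
    const int& t2 = std::max( b, c );      // reference to b or c
    const int& right = std::max( a, t2 );  // then compared against a
    std::cout << left << " " << right << std::endl;  // prints "3 3"
}

Since std::max takes and returns const references, each variant chains a reference through two calls; the only difference is which pair is compared first.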
Why does this difference exist?