I have been trying to understand the practical uses of template metaprogramming (TMP). I see a lot of code along the following lines:
#ifndef LOOP2_HPP
#define LOOP2_HPP
// Compile-time unrolled dot product of two DIM-element arrays.
//
// Primary template: multiplies the leading elements and adds the dot product
// of the remaining DIM-1 elements, so the recursion is expanded through
// template instantiation rather than a runtime loop. The DIM == 1 and
// DIM == 0 specializations below end the recursion; a negative DIM never
// reaches either of them and fails to compile.
template <int DIM, typename T>
class DotProduct {
  public:
    // a, b: pointers to the first of DIM consecutive elements to be read.
    // Returns a[0]*b[0] + a[1]*b[1] + ... + a[DIM-1]*b[DIM-1].
    static T result (T* a, T* b) {
        return *a * *b  +  DotProduct<DIM-1,T>::result(a+1,b+1);
    }
};
// Partial specialization as end criterion: a single remaining element pair.
template <typename T>
class DotProduct<1,T> {
  public:
    static T result (T* a, T* b) {
        return *a * *b;
    }
};
// Partial specialization for zero-length vectors: the empty sum is a
// value-initialized T (0 for arithmetic types). Without it, DIM == 0 would
// instantiate DotProduct<-1,T>, DotProduct<-2,T>, ... without end. Neither
// pointer is dereferenced.
template <typename T>
class DotProduct<0,T> {
  public:
    static T result (T*, T*) {
        return T();
    }
};
// convenience function
template <int DIM, typename T>
inline T dot_product (T* a, T* b)
{
    return DotProduct<DIM,T>::result(a,b);
}
Is it good practice to always explicitly mark such heavily recursive functions as inline?
EDIT:
For a more concrete example, take the following code:
// Prints the integers 0 through N to std::cout, one per line, in ascending
// order. Each instantiation f<N> first calls f<N-1> and then prints its own
// N, so the chain of calls is produced by template instantiation.
// Only f<0> stops the recursion, so N must not be negative.
template <int N>
inline void f() {
    f<N-1>();
    std::cout << N << "\n";
}
// Explicit specialization that ends the recursion: prints only 0.
// An explicit specialization is an ordinary function, not a template, so it
// does not pick up `inline` from the primary template; it is marked inline
// here so the definition stays legal if it is placed in a header included by
// several translation units.
template <>
inline void f<0>() {
    std::cout << 0 << "\n";
}
// Entry point: instantiates and runs f<1>.
int main() {
    f<1>();
    // Falling off the end of main is equivalent to `return 0;`.
}
I just want to use the function f as a way to unroll, at compile time, a bunch of cout statements that I don't want to write out by hand. The following is the assembly generated by gcc-8.3, all optimizations enabled:
        void f<0>():
            push    rbp
            mov     rbp, rsp
            mov     esi, 0
            mov     edi, OFFSET FLAT:_ZSt4cout
            call    std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
            mov     esi, OFFSET FLAT:.LC0
            mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)
        nop
        pop     rbp
        ret
main:
        push    rbp
        mov     rbp, rsp
        call    void f<1>()
        mov     eax, 0
        pop     rbp
        ret
void f<1>():
        push    rbp
        mov     rbp, rsp
        call    void f<0>()
        mov     esi, 1
        mov     edi, OFFSET FLAT:_ZSt4cout
        call    std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
        mov     esi, OFFSET FLAT:.LC0
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)
        nop
        pop     rbp
        ret
It seems that each level of unrolling leads to a runtime call instruction. It is this cost that I want to avoid: I just want the final generated code to be a concatenation of multiple cout statements.
 
    