I have the following linear algebra function call (vector-vector addition) in C++.
int m = 4;
blasfeo_dvec one, two, three;
blasfeo_allocate_dvec(m, &one);
blasfeo_allocate_dvec(m, &two);
blasfeo_allocate_dvec(m, &three);
// initialize vectors ... (omitted)
blasfeo_daxpy(m, 1.0, &one, 0, &two, 0, &three, 0);
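For reference, if I read the double-precision API header correctly, the relevant prototype is (quoted from memory, so possibly slightly off):
// z[zi:zi+m] = alpha * x[xi:xi+m] + y[yi:yi+m]; note the non-const vector pointers
void blasfeo_daxpy(int m, double alpha, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi);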
Using expression templates (ETs), we can wrap it as follows:
three = one + two;
where the vector struct looks like
struct blasfeo_dvec {
    int m;        // length
    int pm;       // packed length
    double *pa;   // pointer to a pm array of doubles, the first element aligned to cache line size
    int memsize;  // size of needed memory

    void operator=(const vec_expression_sum<blasfeo_dvec, blasfeo_dvec> expr) {
        blasfeo_daxpy(m, 1.0, (blasfeo_dvec *) &expr.vec_a, 0, (blasfeo_dvec *) &expr.vec_b, 0, this, 0);
    }
};
The cast to non-const is necessary because blasfeo_daxpy takes non-const pointers. The ET code is simply
template<typename Ta, typename Tb>
struct vec_expression_sum {
    const Ta vec_a;
    const Tb vec_b;

    vec_expression_sum(const Ta va, const Tb vb) : vec_a {va}, vec_b {vb} {}
};

template<typename Ta, typename Tb>
auto operator+(const Ta a, const Tb b) {
    return vec_expression_sum<Ta, Tb>(a, b);
}
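With these definitions, my understanding is that the assignment expands (conceptually) as follows:
// three = one + two; is equivalent to:
// 1. operator+ copies 'one' and 'two' by value and builds a vec_expression_sum<blasfeo_dvec, blasfeo_dvec>
// 2. operator= receives that expression (again by value) and hands its members to blasfeo_daxpy
three.operator=(operator+(one, two));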
The 'native' call, i.e. blasfeo_daxpy(...), generates the following assembly:
; allocation and initialization omitted ...
movl $0, (%rsp)
movl $4, %edi
xorl %edx, %edx
xorl %r8d, %r8d
movsd LCPI0_0(%rip), %xmm0 ## xmm0 = mem[0],zero
movq %r14, %rsi
movq %rbx, %rcx
movq %r15, %r9
callq _blasfeo_daxpy
...
which is exactly what you would expect. The ET code is quite a bit longer:
; allocation :
leaq -120(%rbp), %rbx
movl $4, %edi
movq %rbx, %rsi
callq _blasfeo_allocate_dvec
leaq -96(%rbp), %r15
movl $4, %edi
movq %r15, %rsi
callq _blasfeo_allocate_dvec
leaq -192(%rbp), %r14
movl $4, %edi
movq %r14, %rsi
callq _blasfeo_allocate_dvec
; initialization code omitted
; operator+ :
movq -104(%rbp), %rax
movq %rax, -56(%rbp)
movq -120(%rbp), %rax
movq -112(%rbp), %rcx
movq %rcx, -64(%rbp)
movq %rax, -72(%rbp)
; vec_expression_sum :
movq -80(%rbp), %rax
movq %rax, -32(%rbp)
movq -96(%rbp), %rax
movq -88(%rbp), %rcx
movq %rcx, -40(%rbp)
movq %rax, -48(%rbp)
movq -32(%rbp), %rax
movq %rax, -128(%rbp)
movq -40(%rbp), %rax
movq %rax, -136(%rbp)
movq -48(%rbp), %rax
movq %rax, -144(%rbp)
movq -56(%rbp), %rax
movq %rax, -152(%rbp)
movq -72(%rbp), %rax
movq -64(%rbp), %rcx
movq %rcx, -160(%rbp)
movq %rax, -168(%rbp)
leaq -144(%rbp), %rcx
; blasfeo_daxpy :
movl -192(%rbp), %edi
movl $0, (%rsp)
leaq -168(%rbp), %rsi
xorl %edx, %edx
xorl %r8d, %r8d
movsd LCPI0_0(%rip), %xmm0 ## xmm0 = mem[0],zero
movq %r14, %r9
callq _blasfeo_daxpy
...
It involves quite a bit of copying, namely of the fields of blasfeo_dvec. I (perhaps naively) hoped that the ET code would generate exactly the same code as the native call, given that everything is fixed at compile time and const, but it doesn't.
The question is: why the extra loads? And is there a way of getting fully 'optimized' code? (edit: I use Apple LLVM version 8.1.0 (clang-802.0.42) with -std=c++14 -O3)
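For concreteness, the kind of change I can imagine is storing references in the expression instead of by-value copies, roughly like the sketch below (untested, and I have not checked whether it actually removes the extra loads):
template<typename Ta, typename Tb>
struct vec_expression_sum {
    const Ta &vec_a;   // references instead of copies (sketch only)
    const Tb &vec_b;

    vec_expression_sum(const Ta &va, const Tb &vb) : vec_a {va}, vec_b {vb} {}
};

template<typename Ta, typename Tb>
auto operator+(const Ta &a, const Tb &b) {
    return vec_expression_sum<Ta, Tb>(a, b);
}
// operator= in blasfeo_dvec would likewise take the expression by const reference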
Note: I read and understood this and this post on a similar topic, but they unfortunately do not contain an answer to my question.