I've identified a function in a tight loop that's responsible for 50% of the time in my program (finding nearest neighbors). It calculates the Euclidean distance between two unit vectors. Is there any way to make this run faster? (Currently I am using gcc's -march=native and -ffast-math flags.)
template<typename T>
  /// @brief Cosine-based distance between two vectors: 2 - 2 * cos(angle).
  ///
  /// Accumulates the squared norms of both inputs and their dot product in a
  /// single pass, then returns 2 - 2 * (x . y) / sqrt(|x|^2 * |y|^2). This
  /// equals the squared Euclidean distance between the two inputs after each
  /// has been scaled to unit length.
  ///
  /// @param x  Pointer to the first vector; at least `f` readable elements.
  /// @param y  Pointer to the second vector; at least `f` readable elements.
  /// @param f  Number of components to process; the loop does not run if f <= 0.
  /// @return   The distance described above, or 2 when the product of the
  ///           squared norms is not positive (e.g. a zero vector or f <= 0).
  static inline T distance(const T* x, const T* y, int f) {
    T pp = 0, qq = 0, pq = 0;
    for (int i = 0; i < f; ++i) {
      const T xi = x[i];
      const T yi = y[i];
      pp += xi * xi;
      qq += yi * yi;
      pq += xi * yi;
    }

    // Keep the constant in T: a `2.0` literal would promote the whole
    // expression to double when T is float and narrow it back on return.
    const T two = T(2);
    const T ppqq = pp * qq;
    if (ppqq > 0) {
      // std::sqrt selects the overload matching T; an unqualified `sqrt` may
      // bind to the C double-only function.
      return two - two * pq / std::sqrt(ppqq);
    }
    // Degenerate input: the cosine is undefined, so report the same value
    // that orthogonal vectors would give.
    return two;
  }