I am optimizing an algorithm and I am considering using Vector over double for a multiply and accumulate operation. The implementation the closest is obviously a Vector.dot(v1, v2);... BUT, why is my code so slow?
namespace ConsoleApp1 {
    class Program {
        public static double SIMDMultAccumulate(double[] inp1, double[] inp2) {
            var simdLength = Vector<double>.Count;
            var returnDouble = 0d;
            // Find the max and min for each of Vector<ushort>.Count sub-arrays 
            var i = 0;
            for (; i <= inp1.Length - simdLength; i += simdLength) {
                var va = new Vector<double>(inp1, i);
                var vb = new Vector<double>(inp2, i);
                returnDouble += Vector.Dot(va, vb);
            }
            // Process any remaining elements
            for (; i < inp1.Length; ++i) {
                var va = new Vector<double>(inp1, i);
                var vb = new Vector<double>(inp2, i);
                returnDouble += Vector.Dot(va, vb);
            }
            return returnDouble;
        }
        public static double NonSIMDMultAccumulate(double[] inp1, double[] inp2) {
            var returnDouble = 0d;
            for (int i = 0; i < inp1.Length; i++) {
                returnDouble += inp1[i] * inp2[i];
            }
            return returnDouble;
        }
        static void Main(string[] args) {
            Console.WriteLine("Is hardware accelerated: " + Vector.IsHardwareAccelerated);
            const int size = 24;
            var inp1 = new double[size];
            var inp2 = new double[size];
            var random = new Random();
            for (var i = 0; i < inp1.Length; i++) {
                inp1[i] = random.NextDouble();
                inp2[i] = random.NextDouble();
            }
            var sumSafe = 0d;
            var sumFast = 0d;
            var sw = Stopwatch.StartNew();
            for (var i = 0; i < 10; i++) {
                sumSafe =  NonSIMDMultAccumulate(inp1, inp2);
            }
            Console.WriteLine("{0} Ticks", sw.Elapsed.Ticks);
            sw.Restart();
            for (var i = 0; i < 10; i++) {
                sumFast = SIMDMultAccumulate(inp1, inp2);
            }
            Console.WriteLine("{0} Ticks", sw.Elapsed.Ticks);
//            Assert.AreEqual(sumSafe, sumFast, 0.00000001);
        }
    }
}
The SIMD version needs around 70% more ticks compared to the nonSIMD version. I am running a Haswell architecture and imho. FMA3 should be implemented! (Release build, x64 prefered).
Any ideas? Thanks guys!
 
     
    