I tried to implement my own Vector struct with the new hardware intrinsics support in .NET Core 3 using C#. This is the struct I wrote:
/// <summary>
/// Four-component single-precision vector.
/// </summary>
/// <remarks>
/// The layout is declared sequential so that X, Y, Z and W are stored in
/// declaration order; <see cref="DotSse"/> depends on this when it reads the
/// struct as four consecutive floats.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct Vector4
{
    /// <summary>First component.</summary>
    public float X;

    /// <summary>Second component.</summary>
    public float Y;

    /// <summary>Third component.</summary>
    public float Z;

    /// <summary>Fourth component.</summary>
    public float W;

    /// <summary>Creates a vector from its four components.</summary>
    /// <param name="x">Value stored in <see cref="X"/>.</param>
    /// <param name="y">Value stored in <see cref="Y"/>.</param>
    /// <param name="z">Value stored in <see cref="Z"/>.</param>
    /// <param name="w">Value stored in <see cref="W"/>.</param>
    public Vector4(float x, float y, float z, float w)
    {
        X = x;
        Y = y;
        Z = z;
        W = w;
    }

    /// <summary>Computes the dot product of two vectors with scalar arithmetic.</summary>
    /// <param name="a">Left operand.</param>
    /// <param name="b">Right operand.</param>
    /// <returns>The sum of the component-wise products of <paramref name="a"/> and <paramref name="b"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static float Dot(Vector4 a, Vector4 b)
    {
        return a.X * b.X + a.Y * b.Y + a.Z * b.Z + a.W * b.W;
    }

    /// <summary>
    /// Computes the dot product of two vectors with the SSE4.1 dot-product
    /// instruction, falling back to <see cref="Dot"/> when SSE4.1 is unavailable.
    /// </summary>
    /// <param name="a">Left operand.</param>
    /// <param name="b">Right operand.</param>
    /// <returns>The dot product of <paramref name="a"/> and <paramref name="b"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static unsafe float DotSse(Vector4 a, Vector4 b)
    {
        // Sse41.DotProduct throws PlatformNotSupportedException when the
        // instruction set is missing, so take the scalar path in that case.
        if (!Sse41.IsSupported)
        {
            return Dot(a, b);
        }

        // a and b are by-value parameters, so their addresses can be taken
        // without pinning. Sse.LoadVector128 is the unaligned load.
        var left = Sse.LoadVector128((float*)&a);
        var right = Sse.LoadVector128((float*)&b);

        // Control byte 0xF1: the high nibble (F) includes all four lanes in the
        // multiply-and-sum, the low nibble (1) writes the sum to lane 0 only.
        var product = Sse41.DotProduct(left, right, 0xF1);

        float result;
        Sse.StoreScalar(&result, product);
        return result;
    }
}
However, after running some benchmarks with BenchmarkDotNet, it turned out that my SSE version of the dot product is much slower than the plain scalar version:
    /// <summary>Benchmarks the scalar <c>Vector4.Dot</c> implementation.</summary>
    /// <param name="a">Left operand supplied by the <c>Vectors</c> argument source.</param>
    /// <param name="b">Right operand supplied by the <c>Vectors</c> argument source.</param>
    /// <returns>The computed dot product, returned so the benchmark harness consumes it.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(Vectors))]
    public float Dot(Vector4 a, Vector4 b)
    {
        // The result must escape the method: if it is discarded, the inlined,
        // side-effect-free arithmetic may be removed as dead code and the
        // benchmark would time an effectively empty method.
        return Vector4.Dot(a, b);
    }
    /// <summary>Benchmarks the SSE-based <c>Vector4.DotSse</c> implementation.</summary>
    /// <param name="a">Left operand supplied by the <c>Vectors</c> argument source.</param>
    /// <param name="b">Right operand supplied by the <c>Vectors</c> argument source.</param>
    /// <returns>The computed dot product, returned so the benchmark harness consumes it.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(Vectors))]
    public float DotSse(Vector4 a, Vector4 b)
    {
        // Return the value instead of discarding it so the measured work cannot
        // be optimised away and the timing reflects the actual computation.
        return Vector4.DotSse(a, b);
    }
Results on my notebook with an Intel i7-9750H:
| Method |           a |           b |      Mean |     Error |    StdDev |
|------- |------------ |------------ |----------:|----------:|----------:|
|    Dot | SSE.Vector4 | SSE.Vector4 | 0.0466 ns | 0.0258 ns | 0.0201 ns |
| DotSse | SSE.Vector4 | SSE.Vector4 | 0.6555 ns | 0.0286 ns | 0.0254 ns |
Now I am wondering whether I did something wrong in my implementation of DotSse.
