The following code is used to calculate FIR:
    void Fir(float* pIn, float* pOut, float* pCoeff, float* pStage, uint32_t N, uint32_t FilterLength)
{
    int n, k;
    float* pSrc;
    float* pCoeffSrc = pCoeff;
    float* pDst = pOut;
    float s0, s1, s2, s3;
    __m128 Vec, Mul;
    __m128 Sum0,Sum1,Sum2,Sum3;
    __m128 Zero = _mm_set_ps1(0);
    memcpy(&pStage[FilterLength - 1], pIn, N * sizeof(float));
    for (n = 0; n < N; n+=4)
    {
        //Sum0
        pSrc = &pStage[n];
        Sum0 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;
        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc); 
            Sum0  = _mm_fmadd_ps(Coeff, Vec, Sum0);
            pCoeffSrc += 4;
            pSrc += 4;
        }
        Sum0 = _mm_hadd_ps(Sum0, Zero);
        Sum0 = _mm_hadd_ps(Sum0, Zero);
        //Sum1
        pSrc = &pStage[n+1];
        Sum1 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;
        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc);
            Sum1 = _mm_fmadd_ps(Coeff, Vec, Sum1);
            pCoeffSrc += 4;
            pSrc += 4;
        }
        Sum1 = _mm_hadd_ps(Sum1, Zero);
        Sum1 = _mm_hadd_ps(Sum1, Zero);
        //Sum2
        pSrc = &pStage[n+2];
        Sum2 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;
        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc);
            Sum2 = _mm_fmadd_ps(Coeff, Vec, Sum2);
            pCoeffSrc += 4;
            pSrc += 4;
        }
        Sum2 = _mm_hadd_ps(Sum2, Zero);
        Sum2 = _mm_hadd_ps(Sum2, Zero);
        //Sum3
        pSrc = &pStage[n+3];
        Sum3 = _mm_set_ps1(0);
        pCoeffSrc = pCoeff;
        for (k = 0; k < FilterLength >> 2; k++)
        {
            __m128 Coeff = _mm_load_ps(pCoeffSrc);
            Vec = _mm_load_ps(pSrc);
            Sum3 = _mm_fmadd_ps(Coeff, Vec, Sum3);
            pCoeffSrc += 4;
            pSrc += 4;
        }
        Sum3 = _mm_hadd_ps(Sum3, Zero);
        Sum3 = _mm_hadd_ps(Sum3, Zero);
        Vec = _mm_set_ps(Sum3.m128_f32[0], Sum2.m128_f32[0], Sum1.m128_f32[0], Sum0.m128_f32[0]);
        _mm_store_ps(pDst, Vec);
        pDst+=4;
    }
}
The result of the each inner loop (4) is a scalar sum of a vector. Then I create a vector from 4 scalars by:
Vec = _mm_set_ps(Sum3.m128_f32[0], Sum2.m128_f32[0], Sum1.m128_f32[0], Sum0.m128_f32[0]);
Vec is stored in RAM by: _mm_store_ps(pDst, Vec);
Can I optimize this code ?
Thank you, Zvika