I'm still very new to SIMD intrinsics and 2nd attempt with it thru Cython. After some help from people on here (many thanks), I have this:
from libc.stdlib cimport malloc, free, calloc
cdef extern from "immintrin.h":  # in this example, we use AVX2
    ctypedef float  __m256
    __m256 _mm256_loadu_ps  (float *__P) nogil  
    __m256 _mm256_add_ps    (__m256 __A, __m256 __B) nogil
    __m256 _mm256_mul_ps    (__m256 __A, __m256 __B) nogil
    __m256 _mm256_fmadd_ps  (__m256 __A, __m256 __B, __m256 __C) nogil
    void   _mm256_storeu_ps (float *__P, __m256 __A) nogil  
cdef float [::1] Example_v2 (float *A, float *B, float *C, int n) :
    ### this example for A and B having more than 8 elements, possibly non-divisible by 8    
    cdef:
        __m256 mA, mB, mC
        float *out = <float*> malloc( n * sizeof(float)) 
        float [::1] out_mem = <float [:n]> out
        int i, j, m = <int>( (n-1) / 8) + 1
    with nogil:
        for i in range(m):
            j = 8*i
            mA = _mm256_loadu_ps( &A[j] )
            mB = _mm256_loadu_ps( &B[j] )
            mC = _mm256_loadu_ps( &C[j] )
            _mm256_storeu_ps( &out[j] , _mm256_fmadd_ps( mA, mB, mC ) )
    return out_mem
def run2(float [::1] A, float [::1] B, float [::1] C):
    return Example_v2( &A[0] , &B[0] , &C[0], A.shape[0] )
If n is divisible by 8. numpy A*B+C and cython is identical and that's the ONLY good news.
If n is non-divisible by 8. It can only run once, gives correct answer and then my Spyder will pop up the message below. Second run, without changing anything will result in stall/hang or have same repetition as first run. I don't know how else to fix it.
Another related question: Is there a better way to write the section under the "with nogil" ? Numpy still faster for N = 400, running thru above in loop (100,000). Sigh...
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\_pswindows.py", line 679, in wrapper
    return fun(self, *args, **kwargs)
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\_pswindows.py", line 933, in create_time
    user, system, created = cext.proc_times(self.pid)
ProcessLookupError: [Errno 3] No such process (originated from GetExitCodeProcess != STILL_ACTIVE)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\__init__.py", line 373, in _init
    self.create_time()
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\__init__.py", line 723, in create_time
    self._create_time = self._proc.create_time()
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\_pswindows.py", line 681, in wrapper
    raise convert_oserror(err, pid=self.pid, name=self._name)
psutil.NoSuchProcess: psutil.NoSuchProcess process no longer exists (pid=9296)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:\Users\beng_\Anaconda3\lib\site-packages\qtconsole\manager.py", line 27, in poll
    super(QtKernelRestarter, self).poll()
  File "C:\Users\beng_\Anaconda3\lib\site-packages\jupyter_client\restarter.py", line 113, in poll
    self.kernel_manager.restart_kernel(now=True, newports=newports)
  File "C:\Users\beng_\Anaconda3\lib\site-packages\jupyter_client\manager.py", line 411, in restart_kernel
    self.shutdown_kernel(now=now, restart=True)
  File "C:\Users\beng_\Anaconda3\lib\site-packages\jupyter_client\manager.py", line 371, in shutdown_kernel
    self._kill_kernel()
  File "C:\Users\beng_\Anaconda3\lib\site-packages\spyder\plugins\ipythonconsole\utils\manager.py", line 78, in _kill_kernel
    self.kill_proc_tree(self.kernel.pid)
  File "C:\Users\beng_\Anaconda3\lib\site-packages\spyder\plugins\ipythonconsole\utils\manager.py", line 44, in kill_proc_tree
    parent = psutil.Process(pid)
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\__init__.py", line 346, in __init__
    self._init(pid)
  File "C:\Users\beng_\Anaconda3\lib\site-packages\psutil\__init__.py", line 386, in _init
    raise NoSuchProcess(pid, None, msg)
psutil.NoSuchProcess: psutil.NoSuchProcess no process found with pid 9296
 
    