I can't figure out why this crashes when called from Python.  It is just a simple Cython code to call Intel MKL's vdMul function https://software.intel.com/en-us/mkl-developer-reference-c-v-mul.  I've tried copying every DLL from MKL into the directory and rewriting different parts but it keeps crashing although compiles fine.  Posting here as I probably made an obvious error to someone more experienced working in C++.  Here's the PYX code:
import numpy as np
cimport numpy as np
cimport cython
from cython cimport view
cdef extern from "mkl.h" nogil:
    double* vect_mult "vdMul"(int n,
                          double *a, 
                          double *b,
                          double *y) 
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef mult(double[::1] A, double[::1] B, double[:,::1] output):
    cdef int Ashape0=A.shape[0], Bshape0=B.shape[0]
    cdef int N = Ashape0*Bshape0
    with nogil:
        vect_mult(N, &A[0], &B[0], &output[0,0])
#test script
from cyblas import mult
import numpy as np
a=np.random.randn(1000)
b=np.random.randn(1000)
output = np.zeros((a.shape[0],b.shape[0]))
mult(a,b,output)