I want to implement some fast convex analysis operations - proximal operators and the like. I'm new to Cython and thought that this would be the right tool for the job. I have implementations both in pure Python and in Cython (mwe_py.py and mwe_c.pyx below). However, when I compare them, the Python + Numpy version is significantly faster than the Cython version. Why is this? I have tried using memoryviews, which are supposed to allow for faster indexing/operations; however, the performance difference is very pronounced! Any advice on how to fix mwe_c.pyx below to approach "optimal" Cython code would be greatly appreciated.
import pyximport; pyximport.install(language_level=3)
import mwe_c
import mwe_py
import numpy as np
from time import time
n = 100000
nreps = 10000
x = np.random.randn(n)
z = np.random.randn(n)
tau = 1.0
t0 = time()
for _ in range(nreps):
    out = mwe_c.prox_translation(mwe_c.prox_two_norm, x, z, tau)
t1 = time()
print(t1 - t0)
t0 = time()
for _ in range(nreps):
    out = mwe_py.prox_translation(mwe_py.prox_two_norm, x, z, tau)
t1 = time()
print(t1 - t0)
which gives the outputs, respectively:
10.76103401184082  # (seconds)
5.988733291625977  # (seconds)
Below is mwe_py.py:
import numpy.linalg as la
def proj_two_norm(x):
    """projection onto l2 unit ball"""
    X = la.norm(x)
    if X <= 1:
        return x
    return x / X
def prox_two_norm(x, tau):
    """proximal mapping of l2 norm with parameter tau"""
    return x - proj_two_norm(x / tau)
def prox_translation(prox_func, x, z, tau=None):
  """Compute prox(f(. - z))(x) where prox_func(x, tau) is prox(tau * f)(x)."""
  if tau is None:
      tau = 1.0
  return z + prox_func(x - z, tau)
And here, at last, is mwe_c.pyx:
import numpy as np
cimport numpy as np
cdef double [::1] aasubtract(double [::1] x, double [::1] z):
    cdef unsigned int i, m = len(x), n = len(z);
    assert m == n, f"vectors must have the same length"
    cdef double [::1] out = np.copy(x);
    for i in range(n):
        out[i] -= z[i]
    return out
cdef double [::1] vsdivide(double [::1] x, double tau):
    """Divide an array by a scalar element-wise."""
    cdef:
        unsigned int i, n = len(x);
        double [::1] out = np.copy(x);
    for i in range(n):
        out[i] /= tau
    return out
cdef double two_norm(double [::1] x):
    cdef:
        double out = 0.0;
        unsigned int i, n=len(x);
    for i in range(n):
        out = out + x[i]**2
    out = out **.5
    return out
cdef double [::1] proj_two_norm(double [::1] x):
    """project x onto the unit two ball."""
    cdef double x_norm = two_norm(x);
    cdef unsigned int i, n = len(x);
    cdef double [::1] p = np.copy(x);
    if x_norm <= 1:
        return p
    for i in range(n):
        p[i] = p[i] / x_norm
    return p
cpdef double [::1] prox_two_norm(double [::1] x, double tau):
    """double [::1] prox_two_norm(double [::1] x, double tau)"""
    cdef unsigned int i, n = len(x);
    cdef double [::1] out = x.copy(), Px = x.copy();
    Px = proj_two_norm(vsdivide(Px, tau));
    for i in range(n):
        out[i] = out[i] - Px[i]
    return out
cpdef prox_translation(
    prox_func,
    double [::1] x,
    double [::1] z,
    double tau=1.0
):
    cdef:
        unsigned int i, n = len(x);
        double [::1] out = prox_func(aasubtract(x, z), tau);
    for i in range(n):
        out[i] += z[i];
    return out