I have 2 versions of a function that appends a row to a 2d array; one in Cython and another in Numba.
The Cython version is a lot slower than the Numba version. I would like to optimise the Cython version so that it performs at least as well as the Numba version.
I am timing the code with this timer.py module:
import time
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    On exit the elapsed time in seconds is stored in ``time_taken`` and a
    one-line summary is passed to the ``output`` callable.

    ``start`` and ``end`` are ``time.perf_counter()`` readings: only their
    difference is meaningful, they are not wall-clock timestamps.
    """

    def __init__(self, name='', output=print):
        """Remember the report label and the callable that receives the report.

        Args:
            name: Label placed at the start of the report line.
            output: Callable invoked with the formatted report string.
        """
        self._name = name
        self._output = output

    def __enter__(self):
        """Start the clock and return the timer so results can be read later."""
        # perf_counter is monotonic and uses the highest-resolution clock
        # available, so system clock adjustments cannot skew the measurement.
        self.start = time.perf_counter()
        return self

    def __exit__(self, a, b, c):
        """Stop the clock and emit the report.

        The exception details (a, b, c) are ignored; nothing is returned, so
        an exception raised inside the ``with`` body still propagates.
        """
        self.end = time.perf_counter()
        self.time_taken = self.end - self.start
        self._output('%s Took %0.2fs seconds' % (self._name, self.time_taken))
My append_2d_cython.pyx module is:
#!python
#cython: boundscheck=False
#cython: wraparound=False
import numpy as np
cimport numpy as cnp
cnp.import_array()  # needed to initialize numpy-API
cpdef empty_2d(Py_ssize_t d1, Py_ssize_t d2):
    """Return a new, uninitialised int32 NumPy array of shape (d1, d2).

    The dimensions are taken as Py_ssize_t (the type of memoryview/array
    extents) rather than C int, so large extents are not silently truncated
    when this is called with values such as ``arr.shape[0] + 1``.

    The contents of the returned array are whatever was in memory; callers
    are expected to fill every element before reading it.
    """
    cdef cnp.npy_intp[2] dims
    dims[0] = d1
    dims[1] = d2
    # Allocate directly through the NumPy C-API instead of calling np.empty.
    return cnp.PyArray_SimpleNew(2, dims, cnp.NPY_INT32)
cpdef append_2d(int[:, :] arr, int[:] value):
    """Build a new 2D int buffer holding the rows of ``arr`` followed by ``value``.

    ``arr`` is left untouched; the result has one more row than ``arr`` and is
    returned as a typed memoryview over a freshly allocated array from
    ``empty_2d``.
    """
    cdef Py_ssize_t n_rows = arr.shape[0]
    cdef int[:, :] out = empty_2d(n_rows + 1, arr.shape[1])
    # Copy the existing rows first, then write the extra row into the last slot.
    out[:n_rows, :] = arr
    out[n_rows, :] = value
    return out
My append_2d_numba.py module is:
import numba as nb
import numpy as np
@nb.jit(nopython=True)
def append_2d(arr, value):
    """Return a new array made of the rows of ``arr`` followed by ``value``.

    The output is freshly allocated with one more row than ``arr`` and the
    same dtype; ``arr`` itself is not modified.
    """
    n_rows = arr.shape[0]
    out = np.empty((n_rows + 1, arr.shape[1]), dtype=arr.dtype)
    # The first n_rows rows are the original data; the final row is the new one.
    out[:n_rows] = arr
    out[n_rows] = value
    return out
I am comparing the Numba and Cython versions of append_2d with this script:
import pyximport
import numpy as np
pyximport.install(setup_args={'include_dirs': np.get_include()})
from timer import Timer
from append_2d_cython import append_2d as append_2d_cython
from append_2d_numba import append_2d as append_2d_numba
# Benchmark inputs: a small 5x4 int32 matrix and the 4-element int32 row to append.
arr_2d = np.random.randint(0, 100, size=(5, 4), dtype=np.int32)
arr_1d = np.array([0, 1, 2, 3], dtype=np.int32)
num_tests = 100000

# Time the Cython implementation over num_tests calls.
with Timer('append_2d_cython'):
    for _ in range(num_tests):
        r_cython = append_2d_cython(arr_2d, arr_1d)

# Call the Numba version once up front so JIT compilation is not timed.
append_2d_numba(arr_2d, arr_1d)

# Time the Numba implementation over the same number of calls.
with Timer('append_2d_numba'):
    for _ in range(num_tests):
        r_numba = append_2d_numba(arr_2d, arr_1d)
Which prints:
append_2d_cython Took 0.36s seconds
append_2d_numba Took 0.12s seconds
So, for this code, Numba is 3 times faster than Cython. I would like to refactor the Cython code to be at least as fast as the Numba code. How can I do that?
 
    