I am looking for a fast way to compute a rolling-sum, possibly using Numpy. Here is my first approach:
 def func1(M, w):
     Rtn = np.zeros((M.shape[0], M.shape[1]-w+1))
     for i in range(M.shape[1]-w+1):
         Rtn[:,i] = np.sum(M[:, i:w+i], axis=1)
     return Rtn
 M = np.array([[0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.],
               [0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
               [1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.]])
 window_size = 4
 print func1(M, window_size)
 [[ 0.  0.  1.  2.  2.  3.  3.  3.  3.  2.]
  [ 1.  2.  2.  1.  1.  0.  0.  0.  1.  2.]
  [ 3.  2.  1.  1.  1.  1.  1.  1.  0.  0.]]
I wanted to prevent the window (/sum) from being redone in the loop and hopefully make it much faster so I came up with the following function which limits the sum to only the first and last elements of the rolling window:
 def func2(M, w):
     output = np.zeros((M.shape[0], M.shape[1]-w+1))
     sum = np.sum(M[:, 0:w], axis=1)
     output[:,0] = sum
     for i in range(w, M.shape[1]):
         sum = sum + M[:,i]- M[:,i-w]
         output[:,i-w+1] = sum
     return output
But to my surprise, func2 is barely faster than func1:
 In [251]:
 M = np.random.randint(2, size=3000).reshape(3, 1000)
 window_size = 100
 %timeit func1(M, window_size)
 10 loops, best of 3: 20.9 ms per loop
 In [252]:
 %timeit func2(M, w)
 10 loops, best of 3: 15.5 ms per loop
Am I missing something here? Do you guys know a better, I mean faster way to achieve this?
 
     
     
    