Matrix multiplication using hdf5

I use HDF5 (PyTables) for big matrix multiplication, but I was surprised to find that the HDF5 version runs even faster than plain numpy.dot with the matrices stored in RAM. What is the reason for this behavior?
Also, is there a faster function for matrix multiplication in Python? I still use numpy.dot for the small block multiplications.
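As an aside on that second question: if numpy is linked against a fast BLAS, numpy.dot already dispatches to the BLAS gemm routine for 2-D float arrays, so there is usually little to gain. One possible alternative for many small blocks is calling the BLAS routine directly through scipy, which skips a little Python-side overhead (a minimal sketch; it assumes scipy is installed, which the benchmark below does not otherwise need):

import numpy as np
from scipy.linalg import blas

a = np.random.rand(128, 128)
b = np.random.rand(128, 128)
c = blas.dgemm(alpha=1.0, a=a, b=b)  # same result as np.dot(a, b)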
Here is some code. Assume the matrices can fit in RAM; the test uses matrices of size 10*1000 x 1000.
Using the default numpy build (I think it has no BLAS library):
Plain numpy, everything in RAM: 9.48 s.
A, B in RAM, C on disk: 1.48 s.
A, B, C on disk: 372.25 s.
If I use numpy with MKL, the corresponding times are 0.15 s, 0.45 s and 43.5 s.
The results look reasonable, but I still don't understand why the blocked multiplication is faster than plain numpy.dot in the first comparison, when A and B are kept in RAM.
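As a side note, whether the default build really lacks a BLAS can be checked with numpy itself:

import numpy as np
np.show_config()  # lists the BLAS/LAPACK libraries this numpy build links against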
import numpy as np
import tables
import time

n_row=1000
n_col=1000
n_batch=10
def test_plain_numpy():
    A=np.random.rand(n_row,n_col) # float64 by default
    B=np.random.rand(n_col,n_row)
    t0= time.time()
    res= np.dot(A,B)
    print (time.time()-t0)
#A,B in RAM, C on disk
def test_hdf5_ram():
    rows = n_row
    cols = n_col
    batches = n_batch
    #using numpy array
    A=np.random.rand(n_row,n_col)
    B=np.random.rand(n_col,n_row)
    #settings for all hdf5 files
    atom = tables.Float32Atom() # single precision on disk (note A and B above are float64)
    filters = tables.Filters(complevel=9, complib='blosc') # compression settings, worth tuning
    Nchunk = 128  # chunk edge length, worth tuning
    chunkshape = (Nchunk, Nchunk)
    chunk_multiple = 1
    block_size = chunk_multiple * Nchunk
    #using hdf5
    fileName_C = 'CArray_C.h5'
    shape = (A.shape[0], B.shape[1])
    h5f_C = tables.open_file(fileName_C, 'w')
    C = h5f_C.create_carray(h5f_C.root, 'CArray', atom, shape, chunkshape=chunkshape, filters=filters)
    sz= block_size
    t0= time.time()
    for i in range(0, A.shape[0], sz):
        for j in range(0, B.shape[1], sz):
            for k in range(0, A.shape[1], sz):
                C[i:i+sz,j:j+sz] += np.dot(A[i:i+sz,k:k+sz],B[k:k+sz,j:j+sz])
    print (time.time()-t0)
    h5f_C.close()
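# To separate the effect of the 128x128 blocking (which may simply have better
# cache behavior) from the effect of HDF5 itself, the same triple loop can be
# run with C as a plain in-RAM numpy array. A minimal sketch reusing the
# constants above; test_block_numpy is a made-up name, not one of the timings:
def test_block_numpy(sz=128):
    A = np.random.rand(n_row, n_col)
    B = np.random.rand(n_col, n_row)
    C = np.zeros((A.shape[0], B.shape[1]))
    t0 = time.time()
    for i in range(0, A.shape[0], sz):
        for j in range(0, B.shape[1], sz):
            for k in range(0, A.shape[1], sz):
                C[i:i+sz,j:j+sz] += np.dot(A[i:i+sz,k:k+sz],B[k:k+sz,j:j+sz])
    print (time.time()-t0)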
def test_hdf5_disk():
    rows = n_row
    cols = n_col
    batches = n_batch
    #settings for all hdf5 files
    atom = tables.Float32Atom() # single precision on disk; the float64 random data written below is cast
    filters = tables.Filters(complevel=9, complib='blosc') # compression settings, worth tuning
    Nchunk = 128  # chunk edge length, worth tuning
    chunkshape = (Nchunk, Nchunk)
    chunk_multiple = 1
    block_size = chunk_multiple * Nchunk
    fileName_A = 'carray_A.h5'
    shape_A = (n_row*n_batch, n_col)  # predefined size
    h5f_A = tables.open_file(fileName_A, 'w')
    A = h5f_A.create_carray(h5f_A.root, 'CArray', atom, shape_A, chunkshape=chunkshape, filters=filters)
    for i in range(batches):
        data = np.random.rand(n_row, n_col)
        A[i*n_row:(i+1)*n_row]= data[:]
    rows = n_col
    cols = n_row
    batches = n_batch
    fileName_B = 'carray_B.h5'
    shape_B = (rows, cols*batches)  # predefined size
    h5f_B = tables.open_file(fileName_B, 'w')
    B = h5f_B.create_carray(h5f_B.root, 'CArray', atom, shape_B, chunkshape=chunkshape, filters=filters)
    sz= rows//batches # integer division: the slice bounds below must be ints
    for i in range(batches):
        data = np.random.rand(sz, cols*batches)
        B[i*sz:(i+1)*sz]= data[:]
    fileName_C = 'CArray_C.h5'
    shape = (A.shape[0], B.shape[1])
    h5f_C = tables.open_file(fileName_C, 'w')
    C = h5f_C.create_carray(h5f_C.root, 'CArray', atom, shape, chunkshape=chunkshape, filters=filters)
    sz= block_size
    t0= time.time()
    for i in range(0, A.shape[0], sz):
        for j in range(0, B.shape[1], sz):
            for k in range(0, A.shape[1], sz):
                C[i:i+sz,j:j+sz] += np.dot(A[i:i+sz,k:k+sz],B[k:k+sz,j:j+sz])
    print (time.time()-t0)
    h5f_A.close()
    h5f_B.close()
    h5f_C.close()
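# For completeness, a minimal driver for the three timings above:
if __name__ == '__main__':
    test_plain_numpy()
    test_hdf5_ram()
    test_hdf5_disk()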