As part of a client requirement, I have to develop an application that can process huge CSV files, ranging from 10 MB to 2 GB in size.
Depending on the file size, the module decides whether to read the file with a multiprocessing pool or with a normal CSV reader.
However, when I tested both modes on a 100 MB file, I observed that multiprocessing takes longer than normal CSV reading.
Is this the correct behaviour, or am I doing something wrong?
Here is my code:
import csv
import sys
import multiprocessing as mp

FILE_SIZE_200MB = 200 * 1024 * 1024  # threshold for switching to async mode

def set_file_processing_mode(self, fpath):
    """Pick sync or async mode based on file size"""
    fsize = self.get_file_size(fpath)
    self.read_in_async_mode = fsize > FILE_SIZE_200MB

def read_line_by_line(self, filepath):
    """Reads CSV line by line"""
    with open(filepath, 'rb') as csvin:  # binary mode for Python 2's csv module
        reader = csv.reader(csvin, delimiter=',')
        for row in reader:
            yield row
def read_huge_file(self, filepath):
    """Read file in chunks through a worker pool"""
    pool = mp.Pool(1)
    for chunk_number in range(self.chunks):  # self.chunks = 20
        # submit one chunk, then block on get() until its rows arrive
        proc = pool.apply_async(read_chunk_by_chunk,
                                args=[filepath, self.chunks, chunk_number])
        reader = proc.get()
        yield reader
    pool.close()
    pool.join()
def iterate_chunks(self, filepath):
    """Read huge file rows"""
    for chunklist in self.read_huge_file(filepath):
        for row in chunklist:
            yield row

@timeit  # -- custom decorator
def read_csv_rows(self, filepath):
    """Read CSV rows and pass them to processing"""
    if self.read_in_async_mode:
        print("Reading in async mode")
        for row in self.iterate_chunks(filepath):
            self.process(row)
    else:
        print("Reading in sync mode")
        for row in self.read_line_by_line(filepath):
            self.process(row)

def process(self, formatted_row):
    """Just prints the line"""
    self.log(formatted_row)
def read_chunk_by_chunk(filename, number_of_blocks, block):
    '''
    Splits a file into blocks and returns the lines
    of one of the blocks.
    '''
    results = []
    assert 0 < number_of_blocks
    assert 0 <= block < number_of_blocks
    with open(filename) as fp:
        fp.seek(0, 2)                  # jump to end of file
        file_size = fp.tell()
        # integer division keeps the seek() offsets integral
        ini = file_size * block // number_of_blocks
        end = file_size * (1 + block) // number_of_blocks
        if ini <= 0:
            fp.seek(0)
        else:
            # step back one byte and discard the partial line so
            # every block starts on a line boundary
            fp.seek(ini - 1)
            fp.readline()
        while fp.tell() < end:
            results.append(fp.readline())
    return results
if __name__ == '__main__':
    # classobj is an instance of the class above (setup omitted, see NOTE)
    classobj.read_csv_rows(sys.argv[1])
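
@timeit is my custom decorator, stripped from the snippet above; a simplified sketch of what it does looks like this:

import time
from functools import wraps

def timeit(func):
    """Simplified stand-in for the custom decorator: prints elapsed wall time."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print("FINISHED IN %.2f sec" % (time.time() - start))
        return result
    return wrapper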
Here is a test:
$ python csv_utils.py "input.csv"
Reading in async mode
FINISHED IN 3.75 sec
$ python csv_utils.py "input.csv"
Reading in sync mode
FINISHED IN 0.96 sec
The question is: why does async mode take longer?
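
In case it helps narrow things down, here is a stripped-down way to time just the two read paths in isolation, assuming read_chunk_by_chunk from above is defined in the same module:

import sys
import time
import multiprocessing as mp

CHUNKS = 20  # same chunk count as self.chunks above

def run_async_path(filepath):
    """Mirror of the async path: single-worker pool, one chunk at a time."""
    pool = mp.Pool(1)
    rows = 0
    for chunk_number in range(CHUNKS):
        proc = pool.apply_async(read_chunk_by_chunk,
                                args=[filepath, CHUNKS, chunk_number])
        rows += len(proc.get())
    pool.close()
    pool.join()
    return rows

def run_sync_path(filepath):
    """Mirror of the sync path: plain sequential line iteration."""
    with open(filepath, 'rb') as fp:
        return sum(1 for _ in fp)

if __name__ == '__main__':
    for name, fn in (('async', run_async_path), ('sync', run_sync_path)):
        start = time.time()
        count = fn(sys.argv[1])
        print("%s: %d lines in %.2f sec" % (name, count, time.time() - start))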
NOTE: I removed unnecessary functions/lines to keep the code snippet simple.
