Can someone tell me how to assign jobs to multiple threads to speed up parsing time? For example, I have an XML file with 200k lines; I would like to assign 50k lines to each of 4 threads and parse them using a SAX parser. What I have done so far is have 4 threads each parse all 200k lines, which means 200k*4 = 800k results, i.e. every result is duplicated.
Any help is appreciated.
test.xml:
<?xml version="1.0" encoding="utf-8"?>
<votes>
  <row Id="1" PostId="1" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
  <row Id="2" PostId="1" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
  <row Id="3" PostId="3" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
  <row Id="5" PostId="3" VoteTypeId="2" CreationDate="2014-05-13T00:00:00.000" />
</votes>
My source code:
import json  
import xmltodict  
from lxml import etree
import xml.etree.ElementTree as ElementTree
import threading
import time
def sax_parsing():
    """Stream-parse the test XML file and print the attributes of each <row>.

    Written to be used as a ``threading.Thread`` target: every printed record
    is preceded by the name of the thread executing this function. Attributes
    that are missing from a row (``attrib.get`` finds nothing) are printed
    as ``None``.

    Takes no arguments and returns ``None``.
    """
    # current_thread() and .name replace the camelCase aliases
    # currentThread() / getName(), which are deprecated.
    t = threading.current_thread()
    for _event, element in etree.iterparse("/home/xiang/Downloads/FYP/parallel-python/test.xml"):
        # Only <row> elements carry the attributes reported below.
        if element.tag != 'row':
            continue
        print("Thread: %s" % t.name)
        attrs = element.attrib
        row_id = attrs.get('Id')
        row_post_id = attrs.get('PostId')
        row_vote_type_id = attrs.get('VoteTypeId')
        row_user_id = attrs.get('UserId')
        row_creation_date = attrs.get('CreationDate')
        print('ID: %s, PostId: %s, VoteTypeID: %s, UserId: %s, CreationDate: %s'% (row_id,row_post_id,row_vote_type_id,row_user_id,row_creation_date))
        # The row has been reported; empty it so its data is not kept around
        # while the rest of the document is parsed.
        element.clear()
    return
if __name__ == "__main__":
    # Script entry point: run sax_parsing in several threads and report the
    # total wall-clock time once all of them have finished.
    start = time.time()  # wall-clock start for the execution-time report
    no_threads = 4

    # Keep a reference to every worker so that each one can be joined.
    # NOTE: every worker runs the same sax_parsing target with no arguments,
    # so each thread processes the whole input on its own.
    workers = []
    for _ in range(no_threads):
        worker = threading.Thread(target=sax_parsing)
        worker.start()
        workers.append(worker)

    # Wait for ALL workers. The join used to sit outside the loop, so only
    # the last enumerated thread was awaited and the timing below could be
    # taken while other workers were still running.
    for worker in workers:
        worker.join()

    end = time.time()  # wall-clock end, taken after every worker finished
    exec_time = end - start
    print('Execution time: %fs' % (exec_time))