useI am working on a python script to check if the url is working. The script will write the url and response code to a log file. To speed up the check, I am using threading and queue.
The script works well if the number of url's to check is small but when increasing the number of url's to hundreds, some url's just will miss from the log file. Is there anything I need to fix?My script is
#!/usr/bin/env python
import Queue
import threading
import urllib2,urllib,sys,cx_Oracle,os
import time
from urllib2 import HTTPError, URLError
queue = Queue.Queue()
##print_queue = Queue.Queue()
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl
    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
class ThreadUrl(threading.Thread):
    #Threaded Url Grab
##    def __init__(self, queue, print_queue):
    def __init__(self, queue,error_log):    
        threading.Thread.__init__(self)
        self.queue = queue
##        self.print_queue = print_queue
        self.error_log = error_log
    def do_something_with_exception(self,idx,url,error_log):
        exc_type, exc_value = sys.exc_info()[:2]
##        self.print_queue.put([idx,url,exc_type.__name__])
        with open( error_log, 'a') as err_log_f:
            err_log_f.write("{0},{1},{2}\n".format(idx,url,exc_type.__name__))
    def openUrl(self,pair):
        try:
            idx = pair[1]
            url = 'http://'+pair[2]
            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1')
            #open urls of hosts 
            resp = urllib2.urlopen(request, timeout=10)
##            self.print_queue.put([idx,url,resp.code])
            with open( self.error_log, 'a') as err_log_f:
                err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))
        except:
            self.do_something_with_exception(idx,url,self.error_log)    
    def run(self):
        while True:
            #grabs host from queue
            pair = self.queue.get()
            self.openUrl(pair)
            #signals to queue job is done
            self.queue.task_done()
def readUrlFromDB(queue,connect_string,column_name,table_name):
    try:  
        connection = cx_Oracle.Connection(connect_string)
        cursor = cx_Oracle.Cursor(connection)
        query = 'select ' + column_name + ' from ' + table_name
        cursor.execute(query)
        #Count lines in the file
        rows = cursor.fetchall()
        total = cursor.rowcount        
        #Loop through returned urls
        for row in rows:
            #print row[1],row[2]
##            url = 'http://'+row[2]
            queue.put(row)
        cursor.close()
        connection.close()
        return total
    except cx_Oracle.DatabaseError, e:
        print e[0].context
        raise   
def main():   
    start = time.time()
    error_log = "D:\\chkWebsite_Error_Log.txt"
    #Check if error_log file exists
    #If exists then deletes it
    if os.path.isfile(error_log):  
         os.remove(error_log)
    #spawn a pool of threads, and pass them queue instance 
    for i in range(10):
        t = ThreadUrl(queue,error_log)
        t.setDaemon(True)
        t.start()
    connect_string,column_name,table_name = "user/pass@db","*","T_URL_TEST"
    tn = readUrlFromDB(queue,connect_string,column_name,table_name)
   #wait on the queue until everything has been processed     
    queue.join()
##    print_queue.join()
    print "Total retrived: {0}".format(tn)
    print "Elapsed Time: %s" % (time.time() - start)
main()
 
     
     
    