I'm using Node.js - async & request module to crawl 100+ millions of websites and I keep bumping into errors ESOCKETTIMEDOUT & ETIMEDOUT after few minutes.
It works again after I restart the script. It doesn't seem to be connection limit issue because I can still do resolve4, resolveNs, resolveMx and also curl without delay.
Do you see any issue with the code? or any advice? I'd like to push up the async.queue() concurrency to at least a 1000. Thank you.
var request = require('request'),
    async = require('async'),
    mysql = require('mysql'),
    dns = require('dns'),
    url = require('url'),
    cheerio = require('cheerio'),
    iconv = require('iconv-lite'),
    charset = require('charset'),
    config = require('./spy.config'),
    pool = mysql.createPool(config.db);
iconv.skipDecodeWarning = true;
var queue = async.queue(function (task, cb) {
    dns.resolve4('www.' + task.domain, function (err, addresses) {
        if (err) {
            //
            // Do something
            //
            setImmediate(function () {
                cb()
            });
        } else {
            request({
                url: 'http://www.' + task.domain,
                method: 'GET',
                encoding:       'binary',
                followRedirect: true,
                pool:           false,
                pool:           { maxSockets: 1000 },
                timeout:        15000 // 15 sec
            }, function (error, response, body) {
                //console.info(task);
                if (!error) {
                  // If ok, do something
                } else {
                    // If not ok, do these
                    console.log(error);
                    // It keeps erroring here after few minutes, resolve4, resolveNs, resolveMx still work here.
                    // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
                    // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }
                    var ns = [],
                        ip = [],
                        mx = [];
                    async.parallel([
                        function (callback) {
                            // Resolves the domain's name server records
                            dns.resolveNs(task.domain, function (err, addresses) {
                                if (!err) {
                                    ns = addresses;
                                }
                                callback();
                            });
                        }, function (callback) {
                            // Resolves the domain's IPV4 addresses
                            dns.resolve4(task.domain, function (err, addresses) {
                                if (!err) {
                                    ip = addresses;
                                }
                                callback();
                            });
                        }, function (callback) {
                            // Resolves the domain's MX records
                            dns.resolveMx(task.domain, function (err, addresses) {
                                if (!err) {
                                    addresses.forEach(function (a) {
                                        mx.push(a.exchange);
                                    });
                                }
                                callback();
                            });
                        }
                    ], function (err) {
                        if (err) return next(err);
                        // do something
                    });
                }
                setImmediate(function () {
                    cb()
                });
            });
        }
    });
}, 200);
// When the queue is emptied we want to check if we're done
queue.drain = function () {
    setImmediate(function () {
        checkDone()
    });
};
function consoleLog(msg) {
    //console.info(msg);
}
function checkDone() {
    if (queue.length() == 0) {
        setImmediate(function () {
            crawlQueue()
        });
    } else {
        console.log("checkDone() not zero");
    }
}
function query(sql) {
    pool.getConnection(function (err, connection) {
        if (!err) {
            //console.log(sql);
            connection.query(sql, function (err, results) {
                connection.release();
            });
        }
    });
}
function crawlQueue() {
    pool.getConnection(function (err, connection) {
        if (!err) {
            var sql = "SELECT * FROM domain last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";
            connection.query(sql, function (err, results) {
                if (!err) {
                    if (results.length) {
                        for (var i = 0, len = results.length; i < len; ++i) {
                            queue.push({"id": results[i]['id'], "domain": results[i]['domain'] });
                        }
                    } else {
                        process.exit();
                    }
                    connection.release();
                } else {
                    connection.release();
                    setImmediate(function () {
                        crawlQueue()
                    });
                }
            });
        } else {
            setImmediate(function () {
                crawlQueue()
            });
        }
    });
}
setImmediate(function () {
    crawlQueue()
});
And the system limits are pretty high.
    Limit                     Soft Limit           Hard Limit           Units
    Max cpu time              unlimited            unlimited            seconds
    Max file size             unlimited            unlimited            bytes
    Max data size             unlimited            unlimited            bytes
    Max stack size            8388608              unlimited            bytes
    Max core file size        0                    unlimited            bytes
    Max resident set          unlimited            unlimited            bytes
    Max processes             257645               257645               processes
    Max open files            500000               500000               files
    Max locked memory         65536                65536                bytes
    Max address space         unlimited            unlimited            bytes
    Max file locks            unlimited            unlimited            locks
    Max pending signals       257645               257645               signals
    Max msgqueue size         819200               819200               bytes
    Max nice priority         0                    0
    Max realtime priority     0                    0
    Max realtime timeout      unlimited            unlimited            us
sysctl
net.ipv4.ip_local_port_range = 10000    61000