I'm trying to create a Node.js web scraper. The overall operation of this scraper is:
- Grab an array of URLs from the database. Return it in a promise.
- Send a request to each URL from the database and scrape the data. Return it in a promise.
- Insert the scraped data into the database.
 
I want to be able to compose my steps like so:
getUrls()
  .then(scrapeData)
  .then(insertData);
However, I'm finding that in order to do this, I must wait for ALL of the data from every URL to resolve within step 2 (using `Promise.all`) before I can proceed to the next step in the chain.
This could pose problems because I could be sending requests to thousands of URLs, and if one of them fails during `Promise.all`, all of the data gathered so far is lost.
I would much rather have each function operate like so:
getUrls() //grab array of all urls (could be thousands)
  .then(scrapeData) // for each url scrape data and immediately proceed to chained function
  .then(insertData);
In short, is there a way to send each URL's result down the promise chain as soon as it is ready, and to control at which step the chain waits for data?
My Code:
var express = require('express');
var app = express();
var request = require('request');
var cheerio = require('cheerio');
app.get('/', (req, res) => {
    var sql = require("mssql");
    // Connection settings passed to sql.connect(); every credential is left blank here.
    const config = {
        user: '',
        password: '',
        server: '',
        database: '',
        options: {
            // Use this if you're on Windows Azure
            encrypt: false,
        },
    };
    /**
     * Connects to the database and runs the product query.
     *
     * @returns {Promise<*>} Resolves with the second argument the driver passes to
     *     the query callback (`recordset`); rejects with the connection error or
     *     the query error.
     */
    const getSkus = () => {
        return new Promise((resolve, reject) => {
            sql.connect(config, (connectErr) => {
                if (connectErr) {
                    // The query cannot succeed without a connection, so fail the
                    // promise here instead of only logging and carrying on.
                    console.log(connectErr);
                    reject(connectErr);
                    return;
                }
                // Named dbRequest so it does not shadow the HTTP `request` module.
                const dbRequest = new sql.Request();
                // query to the database and get the records
                dbRequest.query('SELECT URL FROM PRODUCTS', (err, recordset) => {
                    if (err) {
                        console.log("There was an error executing the SQL statement: " + err);
                        reject(err);
                        return;
                    }
                    resolve(recordset);
                });
            });
        });
    };
    /**
     * Builds one search URL per product row.
     *
     * NOTE(review): the query visible in this file selects only the URL column,
     * while this function reads `MPN` from each row; confirm the rows carry it.
     *
     * @param {Array<Object>} skus - Rows whose `MPN` property is the search term.
     * @returns {string[]} One search URL per row, in the same order as `skus`.
     */
    const urlGen = (skus) => {
        const base_url = 'http://somesite.com/search/?q=';
        // Encode the MPN so characters such as spaces, `&` or `#` cannot break
        // or truncate the query string.
        return skus.map((sku) => base_url + encodeURIComponent(sku.MPN));
    };
    /**
     * Requests every URL and scrapes each successful response.
     *
     * @param {string[]} urls - URLs to fetch.
     * @returns {Promise<Array>} Resolves with one scraped result per URL, in input
     *     order, once every request has succeeded. Rejects with the first failure
     *     (Promise.all semantics): a transport error, a non-200 status, or an
     *     error thrown while scraping.
     */
    const makeRequests = (urls) => {
        const promises = urls.map((url) => new Promise((resolve, reject) => {
            request(url, (err, response, html) => {
                if (err) {
                    reject(err);
                    return;
                }
                if (response.statusCode !== 200) {
                    // `err` is null on this path; reject with a real Error so the
                    // failure is not reported downstream as `null`.
                    reject(new Error('Request to ' + url + ' failed with status ' + response.statusCode));
                    return;
                }
                try {
                    //do scraping here
                    // NOTE(review): `jsontemp` is not defined in the visible code;
                    // the scraping step omitted above is expected to produce it.
                    resolve(jsontemp);
                } catch (scrapeErr) {
                    console.log('Error occured during data scraping:');
                    reject(scrapeErr);
                }
            });
        }));
        return Promise.all(promises);
    };
    // Run the pipeline: load the product rows, build the search URLs, scrape
    // them, then answer the HTTP request.
    getSkus()
        .then(urlGen)
        .then(makeRequests)
        // Always end the response; otherwise the client's request to '/' is left
        // hanging because nothing ever writes to `res`.
        .then((results) => res.json(results))
        .catch((e) => {
            console.log(e);
            res.status(500).send('Scraping failed');
        });
});
// Start the HTTP server on port 5000 and log once it is accepting connections.
const server = app.listen(5000, () => {
    console.log('Server is running..');
});