I'm writing a crawler in node.js that fetches data from an e-commerce website. Each input to fetch contains:
- url: URL of the page to fetch
- directory: directory name into which the output file will be written later
- page: the page-number parameter to send with the request
Each page lists a number of items, and each of those items is fetched in detail later.
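For example, one input entry looks like this (the values here are only placeholders, not my real site):
{
    url: 'https://example.com/some-category',   // placeholder listing URL
    directory: 'some-category',                 // placeholder output directory
    page: 2                                     // page number to request
}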
This is my fetchPage promise (agent is require('superagent')), which resolves with the HTML text of the response:
function fetchPage(url, page) {
    return new Promise((resolve, reject) => {
        // only attach the page parameter when a page number is given
        const request = (page > 0) ? agent.get(url).send('page=' + page) : agent.get(url);
        request.end(function (err, res) {
            if (err) {
                reject(err);
            } else {
                // resolve with the raw HTML text of the response
                resolve(res.text);
            }
        });
    });
}
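For reference, I call it like this (the example.com URLs are placeholders): fetchPage(url, n) with n > 0 requests listing page n, and fetchPage(url, 0) fetches a URL as-is, which is what I use for item detail pages:
fetchPage('https://example.com/category', 2).then((html) => console.log(html.length));
fetchPage('https://example.com/item/123', 0).then((html) => console.log(html.length));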
Global calls:
var data=[];
for (var i=1;i<=links[0].numOfPages;i++){
    data.push({
        url:links[0].url,
        directory:links[0].directory,
        page:i
    });
}
const promises = data.reduce(
    (promise, data) => promise.then(() => {
        // return the fetch so the next page starts only after this one (and its items) finish
        return fetchPage(data.url, data.page).then((result) => {
            const urls = getUrls(result);
            // also return Promise.all so the chain waits for every item of this page
            return Promise.all(urls.map((url, i) => fetchPage(url, 0).then(
                (result) => {
                    var item = getItem(result);
                    item.url = url;
                    writeItem(item, data.directory, data.page, i + 1);
                },
                (error) => console.log(error)
            )));
        });
    }),
    Promise.resolve());
promises.then(() => console.log('All done'));
There are 3 utility functions used above (all of them work properly):
- getUrls: processes the HTML text of a listing page and returns an array of URLs of the items to crawl in detail later
- getItem: processes the HTML text of an item's detail page and returns an object that will be written to a file
- writeItem: writes an object to a file, using the given directory and page number to create the proper directory and store the file
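For context, these are their rough shapes as they are used above (the bodies are site-specific, so these stubs are only illustrative):
function getUrls(html) { /* parse a listing page */ return []; }                       // -> array of item URLs
function getItem(html) { /* parse an item detail page */ return {}; }                  // -> plain object for one item
function writeItem(item, directory, page, index) { /* write item under directory/page */ }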
The problem I have been encountering:
- How can I rebuild this using a queue of promises, in which the promises run one by one, one after another, in order, while only a limited number of them are allowed to run concurrently?
How can I do this properly and efficiently? How should I change the current code? I also need a demo.
I deleted the fetchItem function because it was unnecessary (it just called fetchPage with page = 0); now I only use fetchPage.
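To make the goal concrete, this is roughly the shape of the helper I have in mind (just a sketch: runWithConcurrency is a name I made up, and the limit of 3 is arbitrary):
function runWithConcurrency(tasks, limit) {
    // tasks: array of functions, each returning a promise when called
    const results = [];
    let next = 0;
    function runNext() {
        if (next >= tasks.length) {
            return Promise.resolve();
        }
        const index = next++;
        return tasks[index]()
            .then((value) => { results[index] = value; })
            .then(runNext);   // pull the next task as soon as this one finishes
    }
    // start `limit` workers; each keeps pulling tasks until none are left
    const workers = [];
    for (let i = 0; i < Math.min(limit, tasks.length); i++) {
        workers.push(runNext());
    }
    return Promise.all(workers).then(() => results);
}

// e.g. fetch all listing pages with at most 3 requests in flight
const pageTasks = data.map((d) => () => fetchPage(d.url, d.page));
runWithConcurrency(pageTasks, 3).then(() => console.log('All done'));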