I am trying to scrape data from an array of website URLs that are given to me. From each HTML page I only want to fetch the title, an image and the body text. These are the steps that I am following:
- I am trying to get the HTML page (in string format) of each of the URLs and store them in an array:
/**
 * Downloads a page through the codetabs proxy endpoint and returns its body
 * as text.
 *
 * @param {string} value - URL of the page to download; must contain "https".
 * @returns {Promise<string>} The response body, the sentinel "not found" when
 *     the proxy answers with a non-OK status, or the sentinel
 *     "invalid/null url" when `value` is not a string containing "https".
 *     A rejection from `fetch` itself (e.g. a network failure) is not caught
 *     here and propagates to the caller.
 */
async function getdata(value){
    // Guard against null/undefined/non-string input first: calling
    // `.includes` on such a value would throw instead of producing the
    // sentinel that exists for exactly this case.
    if (typeof value !== 'string' || !value.includes('https')) {
        return "invalid/null url";
    }

    const response = await fetch('https://api.codetabs.com/v1/proxy?quest=' + value);
    if (!response.ok) {
        return "not found";
    }
    return response.text();
}
// Start one download per URL. getdata is async, so each call returns a pending
// promise; `responses` ends up holding those promises in the same order as
// `urls`.
let urls = ["https://.......","https://......","https://...."];
// for...of keeps the loop variable block-scoped instead of leaking a
// function/global-scoped `var i`.
for (const url of urls) {
    responses.push(getdata(url));
}
- After fetching the documents, I am trying to parse each HTML document and retrieve its title, image and body (the image can be any image, so I am just taking the first image that I find on the page):
/**
 * Sends a POST request to /server.php asking it to save the image at `url`.
 *
 * @param {string} url - Address of the image, forwarded in the request data.
 * @returns {*} Whatever `$.ajax` returns for the request (presumably a jQuery
 *     jqXHR thenable — confirm that `$` is jQuery on this page).
 */
function saveimagetosystem(url){
    // Form fields the server-side script receives.
    const payload = { "input": "save image", "url": url };

    const request = {
        url: '/server.php',
        type: 'POST',
        data: payload,
        cache: false,
    };

    return $.ajax(request);
}
/**
 * Parses one downloaded HTML page and records its title, first image and
 * paragraph texts in the shared `details` object.
 *
 * Keys written: "title <index>", "imeg <index>" (the value that
 * saveimagetosystem resolves to — per the original author's note, the stored
 * file location) and "body <index> <n>" for the n-th <p> element.
 *
 * @param {string} html_data - Page HTML, or one of the sentinel strings
 *     "not found" / "invalid/null url", for which nothing is recorded.
 * @param {number} index - Position of the page in the URL list; part of
 *     every key written for this page.
 * @returns {Promise<void>} Settles after the image request (if any) has
 *     completed and all keys for this page have been written. A rejection
 *     from saveimagetosystem is not caught here and propagates.
 */
async function processhtmldata(html_data,index){
    // Compare against the sentinels exactly: a substring test would also
    // discard real pages whose markup merely mentions "not found".
    if (typeof html_data !== "string" || html_data === "not found" || html_data === "invalid/null url") {
        return;
    }

    const htmldoc = new DOMParser().parseFromString(html_data, "text/html");

    // A page may have no <title>; querySelector returns null in that case.
    const titleNode = htmldoc.querySelector("title");
    if (titleNode) {
        details["title"+" "+index] = titleNode.innerText.trim();
    }

    // getElementsByTagName never returns null, so check for an actual first
    // element (with a usable src) instead of comparing the collection to null.
    const firstImage = htmldoc.getElementsByTagName("img")[0];
    if (firstImage && firstImage.src) {
        const url = firstImage.src.trim();
        const response = await saveimagetosystem('https://api.codetabs.com/v1/proxy?quest='+url);
        details["imeg"+" "+index] = response;
    }

    // Paragraphs are recorded whether or not the page has an image, and the
    // loop runs to the end of the collection so the last <p> is included.
    const paragraphs = htmldoc.getElementsByTagName("p");
    for (let l = 0; l < paragraphs.length; l++) {
        details["body"+" "+index+" "+l] = paragraphs[l].innerText.trim();
    }
}
// Once every download has settled, process each page using its position in
// the list as the index. processhtmldata is async, so the promises it returns
// are collected and awaited; with forEach they would be discarded and
// `details` could be read before the pages were processed.
let sl_no = 0;
Promise.all(responses)
   .then((htmlfiles) => {
       const pending = htmlfiles.map((file) => processhtmldata(file, sl_no++));
       return Promise.all(pending);
   })
   .then(() => {
       // Every page has been processed at this point; read `details` from
       // here (or from code chained after this step), not earlier.
   })
   .catch((error) => {
       // Surface download/processing failures instead of leaving an
       // unhandled promise rejection.
       console.error("Failed to fetch or process the pages:", error);
   });
Now, my objective here is that my details object should look like this:
{"title 0": Should contain the title of the HTML document page of the FIRST url,
 "imeg 0": Should contain the image of the HTML document page of the FIRST url,
 "body 0 0": Should contain the first body of the HTML document page of the FIRST url,
 "body 0 1": Should contain the second body of the HTML document page of the FIRST url,...
 
 "title 1": Should contain the title of the HTML document page of the SECOND url,
 "imeg 1": Should contain the image of the HTML document page of the SECOND url,
 "body 1 0": Should contain the first body of the HTML document page of the SECOND url,
 "body 1 1": Should contain the second body of the HTML document page of the SECOND url,...
 "title 2": Should contain the title of the HTML document page of the THIRD url,
 "imeg 2": Should contain the image of the HTML document page of the THIRD url,
 "body 2 0": Should contain the first body of the HTML document page of the THIRD url,
 "body 2 1": Should contain the second body of the HTML document page of the THIRD url,...
}
But instead, my details object looks like this:
{"title 0": Contains the title of the HTML document page of the SECOND url,
 "imeg 0": Contains the image of the HTML document page of the THIRD url,
 "body 0 0": Contains the first body of the HTML document page of the FIRST url,
 "body 0 1": Contains the second body of the HTML document page of the FIRST url, and so on....
}
Why am I not receiving the response in a synchronous manner even after using await? Please help me.
