Using the DOM, you could use document.Node.textContent. However, NodeJs doesn't have textContent (since it doesn't have native access to the DOM), therefore you should use external packages. You could install request and cheerio, using npm. cheerio, suggested by Jon Church, is maybe the easiest web scraping tool to use (there are also complexer ones like jsdom)
With power of cheerio and request in your hands, you could write
const request = require("request");
const cheerio = require("cheerio");
const fs = require("fs");
//taken from https://stackoverflow.com/a/19709846/10713877
function is_absolute(url)
{
var r = new RegExp('^(?:[a-z]+:)?//', 'i');
return r.test(url);
}
function is_local(url)
{
var r = new RegExp('^(?:file:)?//', 'i');
return (r.test(url) || !is_absolute(url));
}
function send_request(URL)
{
if(is_local(URL))
{
if(URL.slice(0,7)==="file://")
url_tmp = URL.slice(7,URL.length);
else
url_tmp = URL;
//taken from https://stackoverflow.com/a/20665078/10713877
const $ = cheerio.load(fs.readFileSync(url_tmp));
//Do something
console.log($.text())
}
else
{
var options = {
url: URL,
headers: {
'User-Agent': 'Your-User-Agent'
}
};
request(options, function(error, response, html) {
//no error
if(!error && response.statusCode == 200)
{
console.log("Success");
const $ = cheerio.load(html);
return Promise.resolve().then(()=> {
//Do something
console.log($.text())
});
}
else
{
console.log(`Failure: ${error}`);
}
});
}
}
Let me explain the code. You pass a URL to send_request function. It checks whether the URL string is a path to your local file, (a relative path, or a path starting with file://). If it is a local file, it proceeds to use cheerio module, otherwise, it has to send a request, to the website, using the request module, then use cheerio module. Regular Expressions are used in is_absolute and is_local. You get the text using text() method provided by cheerio. Under the comments //Do something, you could do whatever you want with the text.
There are websites that let you know 'Your-User-Agent', copy-paste your user agent to that field.
Below lines will work
//your local file
send_request("/absolute/path/to/your/local/index.html");
send_request("/relative/path/to/your/local/index.html");
send_request("file:///absolute/path/to/your/local/index.html");
//website
send_request("https://stackoverflow.com/");
EDIT: I am on a linux system.