Using Request.js and Cheerio.js in Node/Express return empty array Using Request.js and Cheerio.js in Node/Express return empty array express express

Using Request.js and Cheerio.js in Node/Express return empty array


Please refer to the code below for a working implementation:

const request = require('request-promise');
const cheerio = require('cheerio');

/**
 * Fetches every URL in parallel and hands the list of page titles to `callback`.
 *
 * The titles are only available once all requests have completed, so they are
 * delivered through `callback` (and never through a synchronous return value).
 *
 * @param {string[]} urls - Pages to fetch.
 * @param {function(string[]): void} callback - Called with the titles, in the
 *     same order as `urls`, once every page has been fetched and parsed.
 * @returns {Promise<void>} Settles after `callback` has run or after a failure
 *     has been logged.
 */
function parseSites(urls, callback) {
    const promiseList = urls.map(getPage);
    return Promise.all(promiseList)
        .then(function (data) {
            callback(data.map(parse));
        })
        .catch(function (err) {
            // Promise.all rejects as soon as one request fails; without this
            // handler the failure would surface as an unhandled rejection.
            console.log(err);
        });
}

/**
 * Starts a GET request for `url`.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<string>} The promise returned by request-promise for the body.
 */
function getPage(url) {
    return request.get(url);
}

/**
 * Extracts the text of the <title> element from an HTML document.
 *
 * @param {string} body - HTML source of a page.
 * @returns {string} The title text.
 */
function parse(body) {
    console.log('parsing body');
    const $ = cheerio.load(body);
    return $('title').text();
}

parseSites(['https://www.google.com', 'https://www.facebook.com'], function (data) {
    console.log(data);
});


First you need to understand the difference between asynchronous and synchronous code. Let's look at an example:

/**
 * Prints the integers 0 through 4 to the console, one per call to console.log.
 */
function testFor() {
    for (const n of Array(5).keys()) {
        console.log(n);
    }
}

-

console.log('start:');testFor();console.log('end:');// Here you get the expected output because this code is synchronous.//output:    start:    0    1    2    3    4    end:

-

console.log('start:');setTimeout(testFor,1000);console.log('end:');// Here you don't get your expected output because setTimeout is asynchronous .//output:    start:    end:    0    1    2    3    4
  1. First, console.log('start:'); is called.
  2. Then setTimeout(testFor,1000); is called (but it is async, so testFor will only execute after 1 second).
  3. Immediately afterwards, console.log('end:'); is called.
  4. Finally, 1 second later, testFor() is executed and it prints 0 1 2 3 4.

The next point is that there is an error in your code!

// The code from the question, kept as-is to show the problem: it starts one
// request per URL and then returns parsedSites right away, before any of the
// request callbacks has had a chance to push a title into it.
function parseSites(urls) {
    var parsedSites = [];
    urls.forEach(function(site) {
        // The callback below is invoked later, once the response for this
        // site arrives; forEach does not wait for it.
        request(site, function(err, res, body) {
            if(err) {
                console.log(err);
            } else {
                var $ = cheerio.load(body);
                parsedSites.push($('title').text());
            }
        //} ! THIS bracket should be removed (extra closing brace in the original code)
        });
    });
    // Runs as soon as forEach has started the requests, so the array is still empty here.
    return parsedSites;
}

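So your problem is that the 'request' in the forEach loop is an async function that will call the callback 'function(err, res, body)' once there is a response from the web page. The forEach loop only starts the requests and does not wait for them, so 'return parsedSites' runs before any callback has pushed a title, and the caller receives an empty array.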

My solutions for this:

'use strict';

const cheerio = require('cheerio');
const request = require('request');
const async = require('async');

const urls = [
    'http://stackoverflow.com/',
    'http://hackaday.com/',
    'https://www.raspberrypi.org/',
    'https://cheerio.js.org/'
];

/**
 * Receives the collected page titles once every request has finished.
 * Shared by both solutions below.
 *
 * @param {string[]} sites - The page titles that were collected.
 */
function finalCall(sites) {
    console.log(sites);
}

// SOLUTION 1: do what you need to do when all calls are done, using recursion.
// The two solutions use different function names so that both can live in the
// same file; with identical names the later declaration would replace the
// earlier one.

/**
 * Fetches the sites one after another, starting at `index`, and calls
 * finalCall(parsedSites) after the last one has been handled.
 *
 * @param {string[]} sites - URLs to fetch.
 * @param {number} index - Position in `sites` of the next URL to fetch.
 * @param {string[]} parsedSites - Accumulator that receives each page title.
 */
function parseSitesRecursively(sites, index, parsedSites) {
    if (index >= sites.length) {
        finalCall(parsedSites); // when all sites are done (or the list is empty).
        return;
    }
    request(sites[index], function (err, res, body) {
        if (err) {
            // A failed site is logged and skipped; the remaining sites are still fetched.
            console.log(err);
        } else {
            const $ = cheerio.load(body);
            const title = $('title').text();
            console.log(title);
            parsedSites.push(title);
        }
        parseSitesRecursively(sites, index + 1, parsedSites); // recursive call
    });
    // Can't return parsedSites here! We are in async calls, so it is not filled yet.
}

parseSitesRecursively(urls, 0, []);

// SOLUTION 2: do what you need to do when all calls are done, using 'async'.

/**
 * Fetches all sites through async.map and calls finalCall with the titles,
 * in the same order as `sites`, once every request has completed. If any
 * request fails, the error is logged and finalCall is not invoked.
 *
 * @param {string[]} sites - URLs to fetch.
 */
function parseSitesWithAsync(sites) {
    async.map(sites, function parseSite(site, callback) {
        request(site, function (err, res, body) {
            if (err) {
                callback(err);
                return;
            }
            const $ = cheerio.load(body);
            callback(null, $('title').text());
        });
    }, function (err, parsedSites) {
        if (err) {
            console.log(err);
        } else {
            finalCall(parsedSites);
        }
    });
}

parseSitesWithAsync(urls);

Async github page

Async example