Using Request.js and Cheerio.js in Node/Express returns an empty array
Please refer to the code below for a working implementation:
const request = require('request-promise');
const cheerio = require('cheerio');

/**
 * Downloads every URL in parallel and extracts each page's <title> text.
 *
 * @param {string[]} urls - Pages to download.
 * @param {function(string[]): void} callback - Called once, after all pages
 *   have been fetched, with the titles in the same order as `urls`.
 * @returns {Promise<string[]>} Resolves with the same titles that were passed
 *   to `callback`; rejects if any request (or the callback itself) fails.
 */
function parseSites(urls, callback) {
  // Nothing can be returned synchronously here: the titles only exist once
  // every request has completed, so the promise itself is handed back.
  return Promise.all(urls.map(getPage)).then(function (bodies) {
    const titles = bodies.map(parse);
    callback(titles);
    return titles;
  });
}

/**
 * Starts a GET request for a single page.
 *
 * @param {string} url - Address of the page.
 * @returns {Promise<string>} Promise returned by request-promise for the page body.
 */
function getPage(url) {
  return request.get(url);
}

/**
 * Extracts the text of the <title> element from an HTML document.
 *
 * @param {string} body - HTML source of the page.
 * @returns {string} Text content of the title element.
 */
function parse(body) {
  console.log("parsing body");
  const $ = cheerio.load(body);
  return $('title').text();
}

parseSites(['https://www.google.com', 'https://www.facebook.com'], function (data) {
  console.log(data);
}).catch(function (err) {
  // Without this handler a failed request would be an unhandled rejection.
  console.error(err);
});
First, you need to understand the difference between asynchronous and synchronous code. Let's look at an example:
/**
 * Logs the integers 0 through 4 to the console, one call per number.
 */
function testFor() {
  let counter = 0;
  while (counter < 5) {
    console.log(counter);
    counter += 1;
  }
}
-
// Synchronous case: each statement finishes before the next one starts,
// so the output appears in the order the statements are written.
console.log('start:');testFor();console.log('end:');
// Output: start: 0 1 2 3 4 end:
-
// Asynchronous case: setTimeout only schedules testFor to run about 1000 ms
// later and returns immediately, so 'end:' is logged before the numbers.
console.log('start:');setTimeout(testFor,1000);console.log('end:');
// Output: start: end: 0 1 2 3 4
- First the console.log('start:'); is called.
- Then setTimeout(testFor,1000); is called (but it is async, so the call will execute in 1 second).
- Immediately after that, console.log('end:'); is called.
- Finally, 1 second later, testFor() is executed and it prints 0 1 2 3 4.
The next point is that there is an error in your code!
/**
 * The version from the question, kept as the example of what goes wrong.
 *
 * It starts one request per URL and returns straight away. Each title is
 * pushed only when that request's callback runs, which happens after the
 * `return` below, so the caller receives an array that is still empty.
 *
 * @param {string[]} urls - Pages to download.
 * @returns {string[]} The result array, not yet filled at the time it is returned.
 */
function parseSites(urls) {
  var parsedSites = [];
  urls.forEach(function(site) {
    request(site, function(err, res, body) {
      if(err) {
        console.log(err);
      } else {
        var $ = cheerio.load(body);
        parsedSites.push($('title').text());
      }
      /* } ! THIS bracket should be removed */
    });
  });
  // Runs before any of the request callbacks above have fired.
  return parsedSites;
}
So your problem is that the 'request' in the forEach loop is an async function that will call the callback 'function(err, res, body)' once there is a response from the web page. By that time, 'return parsedSites' has already run, so the array you get back is still empty.
My solutions for this:
'use strict';

const cheerio = require('cheerio');
const request = require('request');
const async = require('async');

const urls = [
  'http://stackoverflow.com/',
  'http://hackaday.com/',
  'https://www.raspberrypi.org/',
  'https://cheerio.js.org/',
];

// Both solutions use the names `parseSites` and `finalCall`. Each solution is
// wrapped in its own function so the second pair of declarations does not
// replace the first pair.

/**
 * SOLUTION 1: do what you need to do when all calls are done using recursion.
 *
 * Sites are requested one after another; the next request is started from
 * inside the previous request's callback.
 */
function runSolution1() {
  let i = 0; // index of the next entry of `urls` to request
  const parsedSites = [];

  /** Called once with the collected titles when all sites are done. */
  function finalCall(sites) {
    console.log(sites);
  }

  /**
   * Requests one site, records its title, then moves on to the next site.
   *
   * @param {string} site - URL to request.
   * @param {string[]} parsedSites - Accumulator the title is pushed into.
   */
  function parseSites(site, parsedSites) {
    ++i;
    request(site, function(err, res, body) {
      if (err) {
        // A failed site is logged and skipped; the chain still continues.
        console.log(err);
      } else {
        const $ = cheerio.load(body);
        const title = $('title').text();
        console.log(title);
        parsedSites.push(title);
      }
      if (i < urls.length) {
        parseSites(urls[i], parsedSites); // recursive call
      } else {
        finalCall(parsedSites); // when all sites are done
      }
    });
    // Can't `return parsedSites` here: we are in async calls!
  }

  parseSites(urls[i], parsedSites);
}

/**
 * SOLUTION 2: do what you need to do when all calls are done using 'async'.
 *
 * `async.each` invokes the final callback after every iteratee has called
 * back, or as soon as one of them reports an error.
 */
function runSolution2() {
  /** Called once with the collected titles when all sites are done. */
  function finalCall(sites) {
    console.log(sites);
  }

  /**
   * Requests every site and collects the titles.
   *
   * @param {string[]} urls - URLs to request.
   */
  function parseSites(urls) {
    const parsedSites = [];
    async.each(urls, function parseSite(site, callback) {
      request(site, function(err, res, body) {
        if (err) {
          callback(err);
        } else {
          const $ = cheerio.load(body);
          // Titles are pushed as responses arrive.
          parsedSites.push($('title').text());
          callback();
        }
      });
    }, function(err) {
      if (err) {
        console.log(err);
      } else {
        finalCall(parsedSites);
      }
    });
  }

  parseSites(urls);
}

runSolution1();
runSolution2();