迭代node.js请求函数

iterate node.js request function

本文关键字:函数 请求 js node 迭代      更新时间:2023-09-26

这个问题是关于 node.js 中的爬虫程序的。它从 start_url 出发抓取页面,并将抓取到的 URL "推送"到一个 JSON 文件 (output.json) 中。目前,它只用 start_url 运行一次请求函数,并把收集到的 URL 保存在 output.json 中。我希望它能使用保存下来的 URL:把 start_url 替换为第一个收集到的 URL,再次收集链接……如此往复。

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]

// Fetch one page, collect every anchor's href into a list of
// {exurl: [href]} records, and dump that list to output.json.
var req = function(url){
    request(url, function(error, response, html){
        var $ = cheerio.load(html);
        var data = [];
        $("a").each(function() {
            var anchor = $(this);
            // One record per anchor; the href is wrapped in a one-element array.
            data.push({exurl: [anchor.attr("href")]});
            // Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
            // save to "output.json" from time to time, so you can stop it anytime
        });
        // Persist everything collected from this page (overwrites the file).
        fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err){
            if(err){
                console.log(err);
            } else {
                console.log("File successfully written!");
            }
        });
    });
}
// Kick off one crawl per seed URL.
start_url.forEach(function(seed){
    req(seed);
});

所以您可以做的是在回调里递归调用该函数。下面的例子应该可行:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]

// Crawl `url`, append every discovered href (as a plain string) to the
// shared `start_url` queue, snapshot the queue to output.json, then
// recurse on the next unvisited queue entry.
//
// `index` is this URL's position in `start_url`; it defaults to 0 so
// existing callers (`req(start_url[0])`) keep working. The original
// version used a variable local to each call, so every invocation
// ended up re-requesting the same element forever.
var req = function(url, index){
  index = index || 0;

  request(url, function(error, response, html){
    // On a network error `html` is undefined and cheerio would throw.
    if (error) {
      console.log(error);
      return;
    }
    var $ = cheerio.load(html);
    $("a").each(function() {
      var href = $(this).attr("href");
      // Queue plain URL strings — pushing {exurl: [...]} objects (as the
      // original did) makes the later req(start_url[i]) call nonsensical.
      if (href) {
        start_url.push(href);
      }
    });
    try {
      // The original call passed no data argument, which throws.
      // Persist the whole queue so the crawl can be resumed/stopped anytime.
      fs.writeFileSync("output.json", JSON.stringify(start_url, null, 4));
      console.log("File successfully written!");
    } catch(err) {
      console.log(err);
    }

    var next = index + 1;
    if (start_url.length > next) {
      req(start_url[next], next);
    }
  });
}
req(start_url[0]);

这样做的问题是,每次都要把文件完全重写一遍;而且队列只增不减,如果持续运行一段时间,内存就会被耗尽。另一个选项是创建一个写入流:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]

// Stream results out incrementally instead of rewriting the whole file,
// and shift() processed URLs off the queue so it does not grow forever.
var wstream = fs.createWriteStream("output.json");

// Crawl `url`, append every discovered href (a plain string) to the
// `start_url` queue, write it to the stream, then recurse on the next
// queued URL until the queue is empty.
var req = function(url){
  request(url, function(error, response, html){
    // On a network error `html` is undefined and cheerio would throw.
    if (error) {
      console.log(error);
      return;
    }
    var $ = cheerio.load(html);
    $("a").each(function() {
      var href = $(this).attr("href");
      if (href) {
        // Queue plain URL strings — the original pushed {exurl: [...]}
        // objects, which are not valid request() targets.
        start_url.push(href);
        // Write the href itself; concatenating the object (as the
        // original did) emits the useless string "[object Object]".
        wstream.write('"' + href + '",');
      }
    });

    // Drop the URL we just processed, then continue with the next one.
    start_url.shift();
    if (start_url.length > 0) {
      return req(start_url[0]);
    }

    // Queue drained: close the output stream.
    wstream.end();
  });
}
req(start_url[0]);

编辑:切换到一个基本队列,以解决内存问题。