从页面上点击的链接中删除数据,然后移动到下一页&在CasperJS中重复

Scraping data from clicked links on page then moving to next page & repeating in CasperJS

本文关键字:一页 amp CasperJS 移动 数据 删除 链接 然后      更新时间:2023-10-10

在casperjs递归处理页面上的链接后,我很难让它移到下一页。

我可以让它从每个页面获取数据并在页面中移动,或者点击页面上的每个链接,但我不能同时做到这两件事。

var utils = require('utils');
var x = require('casper').selectXPath;
var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  waitTimeout: 10000,
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36     (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  }
});
var currentPage = 1;
var i = 0;
var links = [];
var link_titles = [];

var terminate = function() {
    this.echo("Exiting..").exit();
};
function getSelectedPage() {
    var el = document.querySelector('td.cur');
    return parseInt(el.textContent);
}
function getPageLinks () {
  var links = document.querySelectorAll('h3.r a');
  return [].map.call(links, function(link) {
    return link.getAttribute('href');
  });
}

function getLinkData(link) {
  this.thenOpen(link, function() {

    var title = this.getTitle();

    // Add the data from link
    var data = {
      title: title,
    };
    link_titles.push(data);
  });
}
function loopThroughLinks() {  
 if( i < links.length) {
    this.echo('[LINK #' + i + '] '+ links[i]);
    getLinkData.call(this, links[i]);
    i++;
    this.run(loopThroughLinks);
 }  else {
    utils.dump(link_titles);
    }       
}

function linkData(){
    links = this.evaluate(getPageLinks);
    this.run(loopThroughLinks);
}

var processPage = function() {  
    this.run(linkData);
    //PROBLEM EXISTS BELOW HERE - IF YOU COMMENT OUT FROM HERE IT RUNS AS EXPECTED FOR THE FIRST PAGE
    //WITH CODE BELOW INCLUDED, SKIPS this.run(linkData) AND JUST GOES THROUGH PAGES;
    this.then(function(){
    if (currentPage >= 3) {
        return terminate.call(casper);
    }
    currentPage++;
    this.echo("requesting next page: " + currentPage);
    this.capture("google-results-p" + currentPage + ".png");
    this.thenClick('a.pn span').then(function(){
        this.waitFor(function(){
            return currentPage === this.evaluate(getSelectedPage);
        }, processPage, terminate);
     }); 
  });   //COMMENT OUT TO HERE FOR WORKING ONE PAGE VERSION
}

casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs');
casper.run(processPage);

已更新代码以反映多个运行调用。现在正确地循环第一页,但从第一页打印所有其他页的结果??

var utils = require('utils');
var x = require('casper').selectXPath;
var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  waitTimeout: 10000,
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  }
});
var currentPage = 1;
var i = 0;
var links = [];
var link_titles = [];

var terminate = function() {
    this.echo("Exiting..").exit();
};
function getSelectedPage() {
    var el = document.querySelector('td.cur');
    return parseInt(el.textContent);
}
function getPageLinks() {
    var links = document.querySelectorAll("h3.r a");
    return Array.prototype.map.call(links, function(e) {
        try {
            // google handles redirects hrefs to some script of theirs
            return (/url'?q=(.*)&sa=U/).exec(e.getAttribute("href"))[1];
        } catch (err) {
            return e.getAttribute("href");
        }
    });
} 
function getLinkData(link) {
  this.thenOpen(link, function() {
    //var title = this.fetchText('title');
    var title = this.getTitle();

    // Add the staff data from link
    var data = {
      title: title,
    };
    link_titles.push(data);
    this.then(function(){  ///ADDED - BACK TO RIGHT PAGE FOR SELECTOR
    this.back();
    });
  });
}
function loopThroughLinks() {  
 if( i < links.length) {
    this.echo('[LINK #' + i + '] '+ links[i]);
    getLinkData.call(this, links[i]);
    i++;
    this.then(loopThroughLinks);
 }  else {
    utils.dump(link_titles);
    }       
}

function linkData(){
    links = this.evaluate(getPageLinks);
    this.then(loopThroughLinks);
}

var processPage = function() {  
    this.wait(2000, function(){
    this.then(linkData);
    });

    this.wait(2000, function(){
    this.then(function(){
    if (currentPage >= 3) {
        return terminate.call(casper);
    }

    this.echo("requesting next page: " + currentPage);
    this.capture("google-results-p" + currentPage + ".png");

    currentPage++;

    this.thenClick('a.pn span').then(function(){
        this.capture('google-results-2-p' + currentPage + '.png');
        this.waitFor(function(){
            return currentPage === this.evaluate(getSelectedPage);
        }, processPage, terminate);
     }); 
  });
 });
}

casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs');
casper.then(processPage);
casper.run();

您必须只有一个casper.run()(和一个casper.start())调用。run()启动CasperJS步骤队列,如果没有进一步的步骤,将完成执行。唯一需要保留的调用是casper.run(processPage);,但所有其他this.run(...)调用都需要更改为this.then(...)