CasperJS循环或遍历多个网页
CasperJS loop or iterate through multiple web pages?
我有一个CasperJS脚本,可以从一个网页中抓取评分和日期。现在我想从同一网站下的多个页面上抓取相同的数据。给定以下代码,我如何循环浏览不同的子页面:
var ratings = [];
var dates = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
},
logLevel: "debug",
verbose: true
});
var fs = require('fs');
function getRatings() {
var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
return Array.prototype.map.call(ratings, function(e) {
return e.getAttribute('title');
});
}
function getDate() {
var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
return Array.prototype.map.call(dates, function(e) {
return e.innerHTML;
});
}
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function(){
this.echo('hi');
});
casper.then(function() {
ratings = this.evaluate(getRatings);
dates = this.evaluate(getDate);
this.echo(ratings);
});
casper.run(function() {
this.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
this.echo(ratings);
var content = ratings;
content = content.join("'n");
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'w');
this.echo(dates.length + ' dates found:').exit();
});
感谢您的帮助:(
由于存在下一页按钮,您可以使用它递归遍历所有页面:
function getRatingsAndWrite(){
ratings = casper.evaluate(getRatings);
dates = casper.evaluate(getDate);
casper.echo(ratings);
casper.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
casper.echo(ratings);
var content = ratings;
content = content.join("'n");
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');
casper.echo(dates.length + ' dates found:');
var nextLink = ".BVRRPageLink.BVRRNextPage > a";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.then(getRatingsAndWrite);
} else {
casper.echo("END")
}
}
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm');
casper.then(getRatingsAndWrite);
casper.run();
一个相关的答案是A:点击按钮后,CasperJS解析下一页。
此代码可以帮助您:您可以在一个对象数组中定义所需的url、每个页面的选择器,并在循环中对这些属性执行您想要执行的操作。
你可以在循环中使用点击方法,而不是url。
var navigation = [
{
url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
selectorRatings:'#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img', selectorDate :'#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
}
,{
url: 'yourSecondUrl, etc...',
selectorRatings:'#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
selectorDate :'#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
}
],
content = "";
casper.start()
.then(function(){
//loop on the array
navigation.forEach(function(navIndex){
//open url : property url
casper.thenOpen(navIndex.url)
//wait for the page to load -> must be useless because thenOpen() do it
.waitForUrl(navIndex.url, function(){
//get the value of attribute title of adequate selector
var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title'),
//get the HTML of adequate selector
var dates = this.getHTML(navIndex.selectorDates);
this.echo(ratings);
this.echo(dates);
content = content + ' ' + ratings + ' ' + dates;
});
});
})
.run(function() {
this.echo('----------- All steps done ------------'n');
this.exit();
});
感谢Fanch和Artjom B.您的两个回答都提供了有效的解决方案。我使用Artjom B.给出的递归遍历分页上的"next"页面。接下来,我添加了一个wait((函数,以确保在抓取评级页面之前加载下一个评级页面。如果没有这个wait((函数,我们会在单击"next"的瞬间和resp之间多次抓取同一页面。下一页加载完成。请参阅下面的工作代码:
var ratings = [];
var dates = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
},
logLevel: "debug",
verbose: true
});
var fs = require('fs');
function getRatings() {
var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
return Array.prototype.map.call(ratings, function(e) {
return e.getAttribute('title');
});
}
function getDate() {
var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
return Array.prototype.map.call(dates, function(e) {
return e.innerHTML;
});
}
function getRatingsAndWrite(){
ratings = casper.evaluate(getRatings);
dates = casper.evaluate(getDate);
casper.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
var rating = ratings[i].substr(0,1);
ratings[i] = rating +': '+dates[i];
dates[i] = '';
}
var content = ratings;
content = content.join("'n");
fs.write("<filepath to write content>", content, 'a');
casper.echo(dates.length + ' dates found:');
var nextLink = ".BVRRPageLink.BVRRNextPage > a";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.wait(3000);
casper.then(getRatingsAndWrite);
} else {
casper.echo("END")
}
}
casper.start('http://www.t-mobile.com/cell-phones/htc-one-m8.html');
casper.then(getRatingsAndWrite);
casper.run();
相关文章:
- 循环遍历以数组为值的Javascript对象
- 遍历类元素数组,并在jquery中选择同级元素
- Jquery遍历表元素
- Chrome扩展:遍历不同的页面并收集数据
- 如何遍历包含对象的数组-javascript
- 遍历 JSON 对象并检查 URL 是否以某个值结尾
- 遍历AngularJs中的对象
- JQuery 遍历当前 SELECT 值
- 循环遍历包含另一个表单的表单
- 使用Yadda和Protractor重用步骤定义,遍历所需文件
- 遍历D3中所有数据点之间的所有值
- 自动遍历所有链接的事件
- JS.循环遍历多维数组,以计数元素在每列中的出现次数
- 如何使用 document.querySelectorAll 遍历选定的元素
- 使用Javascript反向遍历XML
- 当知道同一hiearch中至少有一个元素时,遍历到元素.结构使用jquery
- Netsuite Suitelet:在不达到治理限制的情况下,遍历事务行项目的列表加载和提交记录
- 如何使用 JavaScript 遍历网页中的嵌入式资源
- CasperJS循环或遍历多个网页
- 用javascript遍历网页