如何在phantomjs中创建实体
How to create an entity in phantomjs?
我正在尝试为谷歌搜索结果编写抓取器。这是我写的:
// Bootstrap: load the PhantomJS modules, create the page and set up the
// globals the scraping functions below rely on.
var system = require('system');
var webPage = require('webpage');
var args = system.args;
var page = webPage.create();

// Pool of browser user-agent strings; one is chosen at random for this run.
var useragent = [
  'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
  'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
  'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
  'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// URL fragments and shared state.
var request = 'search?q=';
var newPage = '&start=';
var localInfo;
var depth;
var gUrl = 'http://google.com/';
var yaUrl = 'http://yahoo.com/';
var url = '';
var searchPages = [];
var links;

// args[1] is appended to the query prefix; args[2] bounds the page loop in
// prepareSearchPages().
request = request + args[1];
url = url + gUrl + request;
depth = args[2];
/**
 * Opens one results page, scrapes it and moves on to the next page.
 *
 * On a successful load, jQuery is injected into the page, the result links
 * and the local-info text are read into the globals `links` and `localInfo`,
 * both are printed, and nextPage() is scheduled one second later.
 * On a failed load the failure is reported and the next page is tried
 * immediately.
 *
 * @param {string} uri - Address of the results page to load.
 */
function pageHandler(uri) {
  page.open(uri, function (status) {
    if (status !== 'success') {
      // Without this branch a failed load would leave the script idle
      // forever: nothing else calls nextPage() or phantom.exit().
      console.log('Failed to load ' + uri + ' (status: ' + status + ')');
      nextPage();
      return;
    }
    page.injectJs('./libs/jquery-2.1.3.min.js');
    // The callbacks below run inside the page, so they may only use values
    // that exist there (here: literal selectors and the injected jQuery).
    links = page.evaluate(function () {
      return $("li.g h3 a").map(function () {
        return this.href;
      }).get();
    });
    localInfo = page.evaluate(function () {
      return $("#swml_addr").text();
    });
    console.log(links.join('\n'));
    console.log(links.length);
    console.log(localInfo);
    // Wait a second before requesting the next page.
    setTimeout(nextPage, 1000);
  });
}
/**
 * Fills the global `searchPages` queue with one URL per results page and
 * starts processing the queue via nextPage().
 *
 * Page i (0-based) is the base `url` followed by `newPage` and the offset
 * 10 * i. The global `url` itself is left unchanged.
 */
function prepareSearchPages() {
  // `depth` may be a string (it is read from the command-line arguments);
  // convert it once instead of on every comparison.
  var pageCount = Number(depth);
  for (var numPage = 0; numPage < pageCount; numPage++) {
    // Build each address from the untouched base. Appending to `url` and
    // then cutting at the first `newPage` occurrence would also cut off
    // part of a key phrase that happens to contain that prefix.
    searchPages.push(url + newPage + 10 * numPage);
  }
  nextPage();
}
/**
 * Takes the next URL off the global `searchPages` queue and hands it to
 * pageHandler(); when the queue is empty, asks PhantomJS to exit.
 */
var nextPage = function () {
  var file = searchPages.shift();
  if (!file) {
    phantom.exit();
    // Calling phantom.exit() does not return from this function, so leave
    // explicitly instead of falling through to pageHandler(undefined).
    return;
  }
  pageHandler(file);
};
// Entry point: queue the result-page URLs and start scraping them.
prepareSearchPages();
很可能所有人都会觉得这段代码看起来很糟糕,但它运行良好。因此,我决定把搜索引擎封装成一个单独的实体:
// Load PhantomJS's webpage module and create the page object used by
// searchEngine below.
var webPage = require('webpage');
var page = webPage.create();
/**
 * Scraper bound to one search engine.
 *
 * Uses the file-level `page` object to load pages and the global `phantom`
 * object to exit once every queued page has been visited.
 *
 * @constructor
 * @param {Object} engConfig - Engine description.
 * @param {string} engConfig.rootDomain - Base address, e.g. 'http://google.ru/'.
 * @param {string} engConfig.requestPrefix - Text placed between the base
 *     address and the key phrase.
 * @param {string} engConfig.newPagePrefix - Text placed before the result
 *     offset of each page.
 * @param {string} engConfig.linkWrapperSelector - jQuery selector for the
 *     result links.
 * @param {string} engConfig.locSelector - jQuery selector whose text is
 *     printed as the local info.
 */
function searchEngine(engConfig) {
  var _engineUrl = engConfig.rootDomain;
  var _engineRequest = engConfig.requestPrefix;
  var _engineNewPage = engConfig.newPagePrefix;
  var _linkWrapperSelector = engConfig.linkWrapperSelector;
  var _locSelector = engConfig.locSelector;
  var _localInfo;
  var _searchPagesUrls = [];
  // Starts empty so showLinks() is safe to call before any page is scraped.
  var _resultLinks = [];

  // Loads one results page, prints what was scraped and schedules the next.
  var pageHandler = function (uri) {
    page.open(uri, function (status) {
      if (status !== 'success') {
        // Report the failure and keep going; otherwise the script would sit
        // idle forever because nothing else calls nextPage() or exits.
        console.log('Failed to load ' + uri + ' (status: ' + status + ')');
        nextPage();
        return;
      }
      page.injectJs('./libs/jquery-2.1.3.min.js');
      // NOTE: left exactly as posted in the question. The callback declares
      // a `_linkWrapperSelector` parameter, but no value is passed to
      // page.evaluate() after the function, so inside the page the parameter
      // is undefined. This is the problem the question asks about; the
      // answer below the question shows the corrected call.
      _resultLinks = page.evaluate(function (_linkWrapperSelector) {
        return $(_linkWrapperSelector).map(function () {
          return this.href;
        }).get();
      });
      // NOTE: same situation for `_locSelector` (left as posted).
      _localInfo = page.evaluate(function (_locSelector) {
        return $(_locSelector).text();
      });
      console.log(_resultLinks.join('\n'));
      console.log(_resultLinks.length);
      console.log(_localInfo);
      // Wait a second before requesting the next page.
      setTimeout(nextPage, 1000);
    });
  };

  // Processes the next queued URL, or exits PhantomJS when none is left.
  var nextPage = function () {
    var file = _searchPagesUrls.shift();
    if (!file) {
      phantom.exit();
      // Calling phantom.exit() does not return from this function, so leave
      // explicitly instead of falling through to pageHandler(undefined).
      return;
    }
    pageHandler(file);
  };

  /**
   * Queues `depthSearch` result pages for `keyPhrase` and starts scraping.
   *
   * @param {string} keyPhrase - Text appended after the request prefix.
   * @param {number} depthSearch - Number of result pages to visit.
   */
  this.runSearch = function (keyPhrase, depthSearch) {
    var baseUrl = _engineUrl + _engineRequest + keyPhrase;
    for (var numPage = 0; numPage < depthSearch; numPage++) {
      // Build each address from the untouched base so that a key phrase
      // containing the page prefix is never cut off.
      _searchPagesUrls.push(baseUrl + _engineNewPage + 10 * numPage);
    }
    nextPage();
  };

  /**
   * @returns {string} Links from the most recently scraped page, one per
   *     line; an empty string if nothing has been scraped yet.
   */
  this.showLinks = function () {
    return _resultLinks.join('\n');
  };
}
// Settings for scraping Google: URL parts plus the jQuery selectors for the
// result links and the local-info element.
var googleOptions = {
  rootDomain: 'http://google.ru/',
  requestPrefix: 'search?q=',
  newPagePrefix: '&start=',
  linkWrapperSelector: 'li.g h3 a',
  locSelector: '#swml_addr'
};

// Pool of browser user-agent strings; one is chosen at random for this run.
var useragent = [
  'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
  'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
  'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
  'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// Search Google for "Hello", visiting a single results page.
var google = new searchEngine(googleOptions);
google.runSearch('Hello', 1);
不幸的是,它不起作用,我不明白为什么。也许是我把作用域搞乱了。
附言:此代码的第一个版本工作正常,并在控制台中显示所有链接。第二个版本的代码只输出 0,但传递给 pageHandler 函数的 uri 是正确的。甚至不会显示 "undefined" 或类似的内容。
page.evaluate() 在沙盒化的页面上下文中执行,无法访问在其外部定义的变量。您必须显式地将 _linkWrapperSelector 作为参数传递给它:
// The callback runs in the sandboxed page context and cannot see variables
// of this script, so the selector is handed over as an extra argument.
_resultLinks = page.evaluate(function(_linkWrapperSelector) {
return $(_linkWrapperSelector).map(function () {
return this.href;
}).get();
}, _linkWrapperSelector); // passed to the callback as its parameter
_locSelector 也是如此:
// Same approach: the selector is passed in explicitly because the callback
// runs in the sandboxed page context.
_localInfo = page.evaluate(function(_locSelector) {
return $(_locSelector).text();
}, _locSelector); // passed to the callback as its parameter
相关文章:
- 创建一个类似链接的按钮,并通过Javascript函数打开一个新的弹出窗口
- 为effect Composer创建GodRays效果过程
- 从javascript创建一个列表
- 微风从现有实体创建实体
- 如何在phantomjs中创建实体
- 创建Crafty JS实体的类(类的类?)
- 用于使用 DateTime 创建新实体的 API 控制器
- 是否可以捕获内部创建的必应地图实体的事件
- 创建来自 Twitter REST API 实体(主题标签、链接、提及等)的推文文本的链接
- Knockout.js toJSON 从 Breeze 实体创建空对象
- 如何在javascript中创建实体
- 通过选择框创建实体
- 依靠其他实体在微风中创建实体
- 基于实体属性值创建数据网格
- 创建新实体时用javascript刷新视图
- 如何在JHipster 3.x中创建具有相关实体列表的实体视图
- 无法识别微风创建实体类型
- SAPUI5创建带有日期的OData实体——生成以CX_SXML_PARSE_ERROR结束的错误请求负载
- 在不修改/创建实体记录的情况下触发插件
- 基于数据库实体创建模型