如何在phantomjs中创建实体

How to create an entity in phantomjs?

本文关键字:创建 实体 phantomjs      更新时间:2024-06-29

我正在尝试为谷歌搜索结果编写抓取器。这是我写的:

// PhantomJS modules: `system` supplies the command-line arguments and
// `webpage` creates the page object used to load every results page.
var system = require('system');
var args = system.args;
var webPage = require('webpage');
var page = webPage.create();

// Pool of browser user-agent strings; one is chosen at random for this run.
var useragent = [
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// args[1] becomes the value of the `q` query parameter.
var request = "search?q=" + args[1];
// Prefix of the paging parameter appended to every results-page URL.
var newPage = "&start=";
// Text read from the '#swml_addr' element of the most recently loaded page.
var localInfo;
// args[2] is the loop bound used when the results-page URLs are generated.
var depth = args[2];
var gUrl = 'http://google.com/';
// Declared but not referenced anywhere in the visible script.
var yaUrl = 'http://yahoo.com/';
// Base search URL without the paging parameter.
var url = gUrl + request;
// Queue of results-page URLs still to be loaded.
var searchPages = [];
// Result links extracted from the most recently loaded page.
var links;
    /**
     * Loads one results page, extracts the result links and the text of the
     * '#swml_addr' element, prints them, and schedules the next page.
     *
     * Writes its findings to the outer `links` and `localInfo` variables.
     * Nothing happens when the page fails to load (status other than 'success').
     *
     * @param {string} uri - URL of the results page to open.
     */
    function pageHandler(uri) {
        page.open(uri, function (status) {
            if (status === 'success') {
                // jQuery is injected so the `$` calls below can run inside the page.
                page.injectJs('./libs/jquery-2.1.3.min.js');
                links = page.evaluate(function () {
                    return $("li.g h3 a").map(function () {
                        return this.href;
                    }).get();
                });
                localInfo = page.evaluate(function() {
                   return $("#swml_addr").text();
                });
                // One link per line.
                console.log(links.join('\n'));
                console.log(links.length);
                console.log(localInfo);
                // Wait one second before requesting the next page.
                setTimeout(nextPage, 1000);
            }
        });
    }
    /**
     * Queues one results-page URL per page index in `searchPages`, then starts
     * the crawl by calling nextPage().
     *
     * Page i gets the paging value 10 * i, for i from 0 up to (but excluding)
     * `depth`.
     */
    function prepareSearchPages() {
        for (var numPage = 0; numPage < depth; numPage++) {
            // Build each URL from the untouched base instead of appending to
            // `url` and cutting the suffix off again: cutting at the first
            // occurrence of `newPage` would also truncate a search phrase that
            // itself contains that text.
            searchPages.push(url + newPage + 10 * numPage);
        }
        nextPage();
    }
    /**
     * Takes the next URL off the `searchPages` queue and hands it to
     * pageHandler(); ends the PhantomJS run once the queue is empty.
     */
    var nextPage = function() {
        var file = searchPages.shift();
        if (!file) {
            // Calling phantom.exit() does not leave this function, so return
            // explicitly; otherwise pageHandler(undefined) would still run.
            phantom.exit();
            return;
        }
        pageHandler(file);
    };

    // Entry point: queue the results-page URLs and start loading them.
    prepareSearchPages();

很可能大家都会觉得这段代码看起来很糟糕,但它运行良好。因此,我决定把搜索引擎封装成一个单独的实体(对象):

// Create the single PhantomJS page object that searchEngine uses to load pages.
var webPage = require('webpage');
var page = webPage.create();

/**
 * Search-engine scraper. Builds the results-page URLs for a key phrase, loads
 * them one at a time through the shared `page` object, and prints the links
 * and the location text found on each page.
 *
 * @constructor
 * @param {Object} engConfig - Description of the search engine.
 * @param {string} engConfig.rootDomain - Base URL, e.g. 'http://google.ru/'.
 * @param {string} engConfig.requestPrefix - Path and query prefix placed before the key phrase.
 * @param {string} engConfig.newPagePrefix - Paging parameter prefix appended to each URL.
 * @param {string} engConfig.linkWrapperSelector - jQuery selector for the result links.
 * @param {string} engConfig.locSelector - jQuery selector for the location text.
 */
function searchEngine(engConfig) {
    var _engineUrl = engConfig.rootDomain;
    var _engineRequest = engConfig.requestPrefix;
    var _engineNewPage = engConfig.newPagePrefix;
    var _linkWrapperSelector = engConfig.linkWrapperSelector;
    var _locSelector = engConfig.locSelector;
    var _localInfo;
    var _searchPagesUrls = [];
    var _resultLinks;

    // Loads one results page, extracts links and location text, prints them,
    // and schedules the next page one second later.
    var pageHandler = function(uri) {
        page.open(uri, function (status) {
            if (status === 'success') {
                // jQuery is injected so the `$` calls below can run inside the page.
                page.injectJs('./libs/jquery-2.1.3.min.js');
                // NOTE(review): the callbacks below declare the selector as a
                // parameter, but no extra argument is passed to page.evaluate(),
                // so the parameter is undefined inside the page. This is the
                // behaviour the surrounding question asks about and is kept
                // exactly as posted; the answer further down shows the fix.
                _resultLinks = page.evaluate(function(_linkWrapperSelector) {
                    return $(_linkWrapperSelector).map(function () {
                        return this.href;
                    }).get();
                });
                _localInfo = page.evaluate(function(_locSelector) {
                    return $(_locSelector).text();
                });
                // One link per line.
                console.log(_resultLinks.join('\n'));
                console.log(_resultLinks.length);
                console.log(_localInfo);
                setTimeout(nextPage, 1000);
            }
        });
    };

    // Takes the next queued URL, or ends the PhantomJS run when none is left.
    var nextPage = function() {
        var file = _searchPagesUrls.shift();
        if (!file) {
            // Calling phantom.exit() does not leave this function, so return
            // explicitly; otherwise pageHandler(undefined) would still run.
            phantom.exit();
            return;
        }
        pageHandler(file);
    };

    /**
     * Queues `depthSearch` results pages for `keyPhrase` and starts loading them.
     *
     * @param {string} keyPhrase - Search phrase placed after the request prefix.
     * @param {number} depthSearch - Number of results pages to queue.
     */
    this.runSearch = function(keyPhrase, depthSearch) {
        var url = _engineUrl+_engineRequest+keyPhrase;
        for (var numPage = 0; numPage < depthSearch; numPage++) {
            // Built from the untouched base URL, so a key phrase that contains
            // the paging prefix is never truncated.
            _searchPagesUrls.push(url + _engineNewPage + 10 * numPage);
        }
        nextPage();
    };

    /**
     * Returns the links of the most recently processed page, one per line.
     *
     * @returns {string} Newline-separated link URLs.
     */
    this.showLinks = function() {
        return _resultLinks.join('\n');
    };

}
// Settings describing Google to the searchEngine constructor: where to send
// the query, how paging is expressed, and which selectors hold the results.
var googleOptions = {
    rootDomain: 'http://google.ru/',
    requestPrefix: 'search?q=',
    newPagePrefix: '&start=',
    linkWrapperSelector: 'li.g h3 a',
    locSelector: '#swml_addr'
};

// Pool of browser user-agent strings; one is chosen at random for this run.
var useragent = [
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// Search for 'Hello' across a single results page.
var google = new searchEngine(googleOptions);
google.runSearch('Hello', 1);

不幸的是,它不起作用,我不明白为什么。也许是我把作用域(scope)搞乱了。

附言:此代码的第一个版本工作正常,并在控制台中显示所有链接。第二个版本只输出 0,但传递给 pageHandler 函数的 uri 是正确的,甚至不会显示 undefined 或类似的内容。

page.evaluate() 在沙盒化的页面上下文中执行,无法访问在它外部定义的变量。您必须显式地将 _linkWrapperSelector 作为参数传递给它:

// The selector is passed to page.evaluate() after the callback, so it reaches
// the callback as its parameter.
_resultLinks = page.evaluate(function(_linkWrapperSelector) {
    return $(_linkWrapperSelector).map(function () {
        return this.href;
    }).get();
}, _linkWrapperSelector); // extra argument: the selector value for the callback

_locSelector 也是如此:

// Same pattern: the selector is passed to page.evaluate() after the callback.
_localInfo = page.evaluate(function(_locSelector) {
    return $(_locSelector).text();
}, _locSelector); // extra argument: the selector value for the callback