如何在 NodeWebkit 的 HTML 代码中访问 Node 模块抓取到的数据

How do I access the scraped data from node module in my html code in NodeWebkit

本文关键字:代码 节点 模块 数据 抓取 html 访问 NodeWebkit 何访问      更新时间:2023-09-26

我正在尝试使用 NodeWebkit 创建一个应用程序,并使用 node-phantom-simple 模块抓取内容。使用该模块,我能够从网站上抓取内容,但是我应该如何在 HTML 端访问这些数据呢?我认为我无法为这种情况创建一个 REST 服务。这是代码示例:

var file = require('file.js');
var gui = require('nw.gui');
var menu = new gui.Menu({ type: 'menubar' });
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
var driver = require('node-phantom-simple');

// Scrape the ESPNcricinfo home page for the links in the first
// `.scoreline-list`, then open every linked match page in PhantomJS and log
// the HTML of its `.innings-information` elements.
var url = 'http://www.espncricinfo.com/';

/**
 * Opens a single match page in its own PhantomJS instance, injects jQuery,
 * waits for the page's AJAX content and logs the collected innings HTML.
 *
 * Taking the URL as a parameter gives every asynchronous callback chain its
 * own copy of the URL. The previous inline version read a loop variable that
 * had already advanced to the last match by the time the callbacks ran.
 *
 * @param {string} matchUrl - Absolute URL of the match page to open.
 */
function scrapeMatch(matchUrl) {
    driver.create({ path: require('phantomjs').path }, function (err, browser) {
        if (err) {
            console.log("Could not start PhantomJS: ", err);
            return;
        }
        browser.createPage(function (err, page) {
            if (err) {
                console.log("Could not create page: ", err);
                browser.exit();
                return;
            }
            page.open(matchUrl, function (err, status) {
                console.log("opened site? ", status);
                if (err || status !== 'success') {
                    // Nothing to evaluate; release the PhantomJS process.
                    browser.exit();
                    return;
                }
                page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js', function (err) {
                    if (err) {
                        console.log("Could not inject jQuery: ", err);
                        browser.exit();
                        return;
                    }
                    // jQuery is loaded. Wait 5 seconds for AJAX content to
                    // load on the page before reading it.
                    setTimeout(function () {
                        page.evaluate(function () {
                            // Runs inside the PhantomJS page, so `$` is the
                            // jQuery injected above.
                            var h2Arr = [];
                            $('.innings-information').each(function () { h2Arr.push($(this).html()); });
                            return {
                                h2: h2Arr
                            };
                        }, function (err, result) {
                            if (err) {
                                console.log("Page evaluation failed: ", err);
                            } else {
                                console.log(result);
                            }
                            browser.exit();
                        });
                    }, 5000);
                });
            });
        });
    });
}

request(url, function (error, response, html) {
    if (error) {
        console.log("Request failed: ", error);
        return;
    }

    var $ = cheerio.load(html);
    // `.each` instead of `.filter`: the callback is run only for its side
    // effects on the first `.scoreline-list` element.
    $('.scoreline-list').first().each(function () {
        var items = $(this).children();
        var numOfMatches = items.length;
        console.log("Number of Matches:  ", numOfMatches);

        // Collect the URL of each match.
        var matches = [];
        for (var x = 0; x < numOfMatches; x++) {
            var href = items.eq(x).children().first().attr('href');
            if (!href) {
                // Avoid building "...comundefined" for entries without a link.
                console.log("Skipping entry without href at index ", x);
                continue;
            }
            var matchUrl = "http://www.espncricinfo.com" + href;
            matches.push(matchUrl);
            // NOTE(review): `$` here is the cheerio instance created from the
            // fetched HTML above, so this sets a value in the parsed remote
            // document, not on an element of the application's own window.
            $('#editor').val(matchUrl);
            console.log(matchUrl);
        }

        // Scrape each match page; every call captures its own URL.
        matches.forEach(scrapeMatch);
    });
});

谢谢你的帮助!

为什么不能为此创建一个rest端点?只需缓存每个抓取操作的结果,然后在http端点上返回缓存。

// Holds the most recent scrape result; starts out as an empty object.
var cache={};
// HTTP GET /myendpoint responds with the current cache contents as JSON.
app.get('/myendpoint', function(req, res) {
    res.json(cache);
})
// Same scraping flow as in the question; `...` marks code elided here.
request(url, function(error, response, html){
    ...
    setTimeout(function () {
        ...
            console.log(result);
            // Store the scraped result so the endpoint above can serve it.
            // Each assignment replaces the previously cached value.
            cache=result;
            browser.exit();
        });
   }, 5000);
});
// Start the HTTP server on port 1338.
app.listen(1338);

如果希望缓存定期更新,请将抓取逻辑封装在一个函数中,并通过 setInterval 定时调用它。

// Holds the most recent scrape result; starts out as an empty object.
var cache={};
// HTTP GET /myendpoint responds with the current cache contents as JSON.
app.get('/myendpoint', function(req, res) {
    res.json(cache);
})
// Runs one scrape and stores its result in `cache`.
// `...` marks code elided here (same scraping flow as in the question).
function updateCache() {
    request(url, function(error, response, html){
        ...
        setTimeout(function () {
            ...
                console.log(result);
                // Store the scraped result so the endpoint above can serve it.
                // Each assignment replaces the previously cached value.
                cache=result;
                browser.exit();
            });
       }, 5000);
    });
}
// Update the cache every 60 seconds. setInterval does not invoke the callback
// immediately, so the cache stays empty until the first interval has elapsed.
setInterval(updateCache, 60000);
// Start the HTTP server on port 1338.
app.listen(1338);