在执行页面的 JavaScript 之后保存页面的 HTML 输出

save html output of page after execution of the page's javascript

本文关键字：保存 HTML javascript 输出 执行；更新时间：2023-09-26

我想抓取一个网站：它先加载 html/js，其中的 js 会修改表单输入字段，然后以 POST 方式提交。我怎样才能得到 POST 之后页面的最终 html 输出？

我试图用phantomjs这样做，但它似乎只有一个选项来渲染图像文件。在谷歌上搜索一下，这应该是可能的，但我不知道怎么做。我的尝试:

// PhantomJS script: open the page, then write a screenshot and the page's
// current HTML to disk from inside the open() callback.
var fs = require('fs');
var page = require('webpage').create();

// Runs when page.open() invokes its callback.
function saveOutput() {
    // Placeholder: code to run inside the page context goes here.
    page.evaluate(function () {});
    page.render('export.png');
    fs.write('1.html', page.content, 'w');
    phantom.exit();
}

page.open('https://www.somesite.com/page.aspx', saveOutput);

这段代码将用于客户端，我不能指望他安装太多的包(nodejs, casperjs等)

谢谢

您用来输出的代码本身是正确的，但存在时序（同步）问题：您的输出语句会在页面加载完成之前就被执行。您可以绑定 onLoadFinished 回调函数来获知页面何时加载完成。完整代码如下:

    // PhantomJS script: write the screenshot and HTML from the onLoadFinished
    // handler instead of from the open() callback.
    var fs = require('fs');
    var page = new WebPage();

    // Assigned below as the page's onLoadFinished handler.
    function dumpPage() {
      console.log("page load finished");
      page.render('export.png');
      fs.write('1.html', page.content, 'w');
      phantom.exit();
    }

    page.onLoadFinished = dumpPage;

    page.open("http://www.google.com", function() {
      // Placeholder: code to run inside the page context goes here.
      page.evaluate(function() {});
    });

用 google 这类网站测试时可能会产生误导，因为它加载得非常快，像您那样直接内联执行截图往往也能成功。在 phantomjs 中，时序是一件棘手的事情；我有时会用 setTimeout 来测试，看看问题是否出在时序上。

当我直接复制您的代码并将URL更改为www.google.com时，它工作正常，保存了两个文件:

export.png
1.html

请记住，文件将写入您运行脚本的位置，而不是您的.js文件所在的位置

经过2天的挣扎和沮丧，我终于解决了我的类似问题。解决问题的关键是 PhantomJS 官方网站上的 waitfor.js 示例。祝你好运!

"use strict";
/**
 * Poll a condition every 250ms and run a callback once it holds, giving up
 * after a timeout.
 *
 * The condition is evaluated on each tick while it is still falsy and the
 * time budget has not elapsed. On the first tick after it evaluated truthy,
 * `onReady` runs exactly once. If the budget elapses with the condition still
 * falsy, a message is logged and `phantom.exit(1)` is called.
 *
 * @param {Function|string} testFx - Condition to poll. A string is evaluated
 *     with eval(); a function is called and its return value is used.
 * @param {Function|string} onReady - Action to run once the condition holds.
 *     A string is evaluated with eval(); a function is called.
 * @param {number} [timeOutMillis] - Maximum time to wait, in milliseconds.
 *     Any falsy value (including 0) selects the default of 3000.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default max timeout is 3s
        start = new Date().getTime(),
        condition = false,
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // Not timed out yet and condition not yet fulfilled: test again.
                // NOTE(review): string arguments go through eval(); only pass
                // trusted, hard-coded strings here.
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx());
                return;
            }

            // Either outcome below is final. Stop the timer first so that a
            // throwing onReady cannot leave it running and be invoked again
            // on every following tick.
            clearInterval(interval);

            if (!condition) {
                // Timed out while the condition was still falsy.
                console.log("'waitFor()' timeout");
                phantom.exit(1);
                return;
            }

            // Condition fulfilled: run the caller's action once.
            console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
            if (typeof(onReady) === "string") {
                eval(onReady);
            } else {
                onReady();
            }
        }, 250); //< repeat check every 250ms
}

var page = require('webpage').create();
// Open Twitter on the 'sencha' profile and, once the page has loaded, wait
// for the sign-in dropdown to become visible.
page.open("http://twitter.com/#!/sencha", function (status) {
    // Check for page load success
    if (status !== "success") {
        console.log("Unable to access network");
        // Nothing else in this branch ends the script; PhantomJS only
        // terminates on phantom.exit(), so exit with a failure status
        // instead of hanging.
        phantom.exit(1);
        return;
    }

    // Wait for 'signin-dropdown' to be visible
    waitFor(function() {
        // Runs inside the page; relies on the page itself providing `$`
        // (presumably jQuery — confirm for the target site).
        return page.evaluate(function() {
            return $("#signin-dropdown").is(":visible");
        });
    }, function() {
        console.log("The sign-in dialog should be visible now.");
        phantom.exit();
    });
});

我尝试了几种方法来完成类似的任务，使用Selenium获得了最好的结果。

在此之前我尝试过PhantomJS和Cheerio。当在页面上执行JS时，Phantom经常崩溃。

我使用CasperJS与PhantomJS运行测试。我将这段代码添加到我的tearDown函数中:

var require = patchRequire(require);
var fs = require('fs');

// tearDown hook: capture a screenshot and write the page HTML to disk.
function saveArtifacts() {
    casper.capture("export.png");
    // No selector, outer flag set to true — presumably the whole document's
    // outer HTML; confirm against the CasperJS getHTML documentation.
    var html = casper.getHTML(undefined, true);
    fs.write("1.html", html, 'w');
}

// Test body: run the casper steps, then mark the test as done.
function runTest(test) {
    // test code
    casper.run(function () {
        test.done();
    });
}

casper.test.begin("My Test", {
    tearDown: saveArtifacts,
    test: runTest
});

除了使用无头浏览器之外，我能想到的另一种方法显然是模拟ajax调用，逐个请求地把页面加载后的处理结果拼接起来。然而，这通常有点棘手，应该作为最后的手段，除非你真的喜欢深挖javascript代码。

这可以用一些php代码和javascript很容易地完成：用下面这行代码取得生成后的页面源码，再用fopen()和fwrite()保存它:var generatedSource = new XMLSerializer().serializeToString(document);