在Array中存储多个抓取数据

Store multiple scraping data in an Array

本文关键字:抓取 数据 存储 Array      更新时间:2023-09-26

第一部分:

目前我抓取一个网页以获取链接,然后逐个打开抓取到的链接,这部分工作正常。

第二部分:

我检查是否有(select)选项字段的选择,并将值(id)存储在数组中,这部分也可以正常工作。

第三部分:

我想遍历这些选项(select 的各个 option),对每个选项触发点击事件,等待AJAX响应,然后提取infoProduct,并将其存储在一个数组中。

我在这部分遇到了麻烦,我得到了一个空数组,因为return listProducts在第一个this.eachThen开始之前被调用。

/**
 * Builds the product record for the page that is currently loaded.
 *
 * Relies on `getName`, `getImage` and `products`, which are defined
 * outside this snippet.
 *
 * @param {Object} obj - Page descriptor; only `obj.url` is read, and only
 *     when no variation is given.
 * @param {Object|boolean} variation - Falsy for a product without
 *     variations; otherwise an object providing `url`, `id` and
 *     `description`.
 * @returns {Array|Object} A one-element array holding the product when
 *     `variation` is falsy, or a single product object (with the variation
 *     fields added) when it is truthy.
 */
function getInfosProduct(obj, variation) {
    var productName = getName();
    var imageUrl = products.image.getElement.link + getImage();

    if (variation) {
        // Variation case: a bare object, the caller collects them itself.
        return {
            name: productName,
            image: imageUrl,
            url: variation.url,
            idVariation: variation.id,
            descVariation: variation.description
        };
    }

    // No variation: already wrapped in an array.
    return [{
        name: productName,
        image: imageUrl,
        url: obj.url
    }];
}
/**
 * Selects a variation on the page when it is eligible.
 *
 * A variation is eligible when `variation.ok` is truthy and its `id` is
 * not (loosely) equal to `variation.ignore`. Eligible variations are
 * selected through `chooseVariation` using `products.selector`; both names
 * are defined outside this function.
 *
 * @param {Object} variation - Object providing `ok`, `id` and `ignore`.
 * @returns {boolean} `true` when the variation was selected, else `false`.
 */
function clickVariation(variation) {
    // Loose comparison is intentional here: ids may differ in type.
    var isEligible = variation.ok && variation.id != variation.ignore;
    if (!isEligible) {
        return false;
    }
    chooseVariation(products.selector, variation.id);
    return true;
}
/**
 * Collects product information for the current page (version from the
 * question; it exhibits the empty-array problem described in the text).
 *
 * @param {Object} obj - Provides `level`, `levelVariation` and `url`.
 * @returns {Array} For `level == 0`, whatever evaluating `getInfosProduct`
 *     in the page returns. Otherwise `listProducts`, which is still empty
 *     when it is returned (see the note on the return statement).
 */
casper.getInfosProducts = function(obj) {
    if (obj.level == 0) {
        // No variations: evaluate synchronously in the page and return.
        return this.evaluate(getInfosProduct, obj,false);
    } else {
        // NOTE: assigned without a declaration, so this is an implicit global.
        listProducts = [];
        // eachThen only schedules the callback as later steps; it does not
        // run it before this function continues.
        this.eachThen(obj.levelVariation, function getInfosProducts(variation) {
            // NOTE: isClick is also an implicit global.
            isClick = this.evaluate(clickVariation, variation.data)
            if (isClick) {
                // The wait is asynchronous as well; its callback only logs.
                this.waitForSelectorTextChange('.selector', function() {
                    this.echo('The text on .selector has been changed.');
                });
                // This push is outside the wait callback, so it does not
                // wait for the text change to happen.
                listProducts.push(this.evaluate(getInfosProduct, obj,variation.data));
                                }
        });
        // Runs before any of the scheduled eachThen steps have executed,
        // which is why the caller receives an empty array.
        return listProducts;
    }
};

触发select元素的更改事件的函数

/**
 * Selects the first option of a <select> whose value contains
 * `valueToMatch`, then dispatches a bubbling "change" event on the
 * <select>.
 *
 * `selector` may match the <select> itself or something inside it (for
 * example its <option> elements); the owning <select> is found by walking
 * up from the first match. The previous version assigned `selectedIndex`
 * on the NodeList returned by querySelectorAll, which has no effect on the
 * page.
 *
 * @param {string} selector - CSS selector for the <select> or its options.
 * @param {string} valueToMatch - Substring to look for in option values.
 * @throws {Error} When nothing matches `selector`, or when the first match
 *     is not a <select> and is not contained in one.
 */
function chooseVariation(selector, valueToMatch) {
    var matched = document.querySelectorAll(selector);
    if (matched.length === 0) {
        throw new Error('chooseVariation: no element matches selector "' + selector + '"');
    }

    // Walk up from the first match until the owning <select> is found.
    var select = matched[0];
    while (select && String(select.tagName).toUpperCase() !== 'SELECT') {
        select = select.parentNode;
    }
    if (!select) {
        throw new Error('chooseVariation: selector "' + selector + '" does not target a <select>');
    }

    var found = false;
    Array.prototype.forEach.call(select.options, function(opt, i) {
        if (!found && opt.value.indexOf(valueToMatch) !== -1) {
            select.selectedIndex = i;
            found = true;
        }
    });

    // dispatch change event in case there is some kind of validation
    var evt = document.createEvent("UIEvents"); // or "HTMLEvents"
    evt.initUIEvent("change", true, true);
    select.dispatchEvent(evt);
}

这是主函数:

    /**
     * Schedules one scraping step on `casper`, dispatching on `obj.action`.
     *
     * Visible actions:
     *  - "openLinks": opens every entry of the outer `links` array whose
     *    `ok` flag is set and recurses with "getVariations" for that URL.
     *  - "getVariations": calls `this.getVariations(obj.url)` and recurses
     *    with "getInfosProducts", passing the result as `objVariations`.
     *  - "getInfosProducts": for every entry of `obj.objVariations.list`,
     *    calls `this.getInfosProducts(...)` and echoes its return value as
     *    JSON.
     * Other actions are elided in this snippet.
     *
     * `objVariations` and `infosProd` used to be assigned without a
     * declaration (implicit globals, a ReferenceError in strict mode); they
     * are now local variables.
     *
     * @param {Object} obj - Step descriptor; `action` selects the branch,
     *     `url` and `objVariations` are read by the branches that need them.
     */
    function startSraping(obj) {
        casper.then(function switchAction() {
            var objVariations;
            switch (obj.action) {
                //......... some code ........
                // iterating through array links and open pages
                case "openLinks":
                    this.each(links, function eachOpenLinks(self, link) {
                        if (link.ok) {
                            self.thenOpen(link.url, function thenOpenLinks() {
                                startSraping({
                                    url: link.url,
                                    action: "getVariations"
                                });
                            });
                        }
                    });
                    break;
                // get all variations for each page opened
                case "getVariations":
                    objVariations = this.getVariations(obj.url);
                    startSraping({
                        url: obj.url,
                        action: "getInfosProducts",
                        objVariations: objVariations
                    });
                    break;
                case "getInfosProducts":
                    this.eachThen(obj.objVariations.list, function(levelVariation) {
                        var infosProd = this.getInfosProducts({
                            levelVariation: levelVariation.data,
                            url: obj.url,
                            level: obj.objVariations.level
                        });
                        // Only the synchronous return value is printed; if
                        // getInfosProducts fills its result in later steps,
                        // this is still an empty array at this point.
                        this.echo(JSON.stringify(infosProd), 'INFO');
                    });
                    break;
            }
        });
    }
    // Entry point: open `url`, queue the first scraping step, then run the
    // queued steps. `url` and `variation` are defined outside this snippet.
    casper.start(url, function start() {
        var firstStep = {
            variation: variation,
            action: "submitSearch"
        };
        startSraping(firstStep);
    });
    casper.run();

如果一个函数需要以同步方式返回结果,你就不能在它内部调用异步函数(eachThen 和 waitForSelectorTextChange 都是异步的),并指望它同步返回这些异步函数的结果(一般参考)。由于CasperJS不支持Promises,这就有点棘手了。

我认为下面的改动是最小的,并且能达到你想要的效果。

/**
 * Collects product information for the current page and hands it to
 * `callback` once the scheduled steps have run, instead of returning it.
 *
 * The `level == 0` branch previously called `arr.push(...)` on an
 * undefined `arr` (and `push` returns a length, not the array). Since
 * `getInfosProduct` already returns a one-element array when called
 * without a variation, that array is now passed through directly.
 *
 * @param {Object} obj - Provides `level`, `levelVariation` and `url`.
 * @param {Function} callback - Called with `this` bound to the step
 *     context and the array of collected products as its only argument.
 */
casper.getInfosProducts = function(obj, callback) {
    if (obj.level == 0) {
        this.then(function() {
            callback.call(this, this.evaluate(getInfosProduct, obj, false));
        });
    } else {
        var listProducts = [];
        this.eachThen(obj.levelVariation, function getInfosProducts(variation) {
            var isClick = this.evaluate(clickVariation, variation.data);
            if (isClick) {
                // Extract inside the wait callback so the product is read
                // only after the wait on '.selector' has completed.
                this.waitForSelectorTextChange('.selector', function() {
                    this.echo('The text on .selector has been changed.');
                    listProducts.push(this.evaluate(getInfosProduct, obj, variation.data));
                });
            }
        });
        // Scheduled after the eachThen steps; intended to fire once the
        // list has been filled.
        this.then(function() {
            callback.call(this, listProducts);
        });
    }
};

In startSraping:

case "getInfosProducts":
    this.eachThen(obj.objVariations.list, function(levelVariation) {
        this.getInfosProducts({
            levelVariation: levelVariation.data,
            url: obj.url,
            level: obj.objVariations.level
        }, function (infosProd){
            // this is the asynchronous callback
            this.echo(JSON.stringify(infosProd), 'INFO');
        });
    });
    break;