您现在的位置是:网站首页> NodeJS

Puppeteer专题

  • NodeJS
  • 2023-12-02
  • 494人已阅读
摘要

Puppeteer和Headless chrome的简介和应用

Puppeteer性能优化与执行速度提升

node puppeteer拦截谷歌请求、设置浏览器响应-爬取电子书链接



Puppeteer和Headless chrome的简介和应用

Puppeteer在线文档

Puppeteer和Headless chrome的简介和应用

性能优化

// Launch headless Chromium with a set of switches that disable optional
// subsystems. Switch names start with two ASCII hyphens and must be written
// with plain quotes; typographic quotes or dashes are not valid JavaScript.
const browser = await puppeteer.launch({
    headless: true,
    args: [
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--disable-setuid-sandbox',
        '--no-first-run',
        '--no-sandbox',
        '--no-zygote',
        '--single-process'
    ]
});



Puppeteer:在page.evaluate()中传参

// Hand a Node-side value to the page function by passing it as an extra
// argument to page.evaluate(); it arrives as the callback's parameter.
const links = await page.evaluate(function (value) {
  console.log(value); // should be defined now  …
}, evalVar);



滚动获得内容通过源码

/**

 * Created by Administrator on 2021-07-10.

 */

const puppeteer = require('puppeteer');

const devices = require('puppeteer/DeviceDescriptors');

const iPhone = devices['iPhone 6'];

/**
 * Print the crawl result on stdout as JSON wrapped in "####" markers.
 *
 * @param {*} result - Value to serialize with JSON.stringify.
 */
function returnCrawler(result){
    const payload = JSON.stringify(result);
    console.log(`####${payload}####`);
}

/**
 * Remove leading and trailing whitespace from a string.
 *
 * Uses the built-in String.prototype.trim, which strips the same characters
 * as the regular-expression class \s without the backtracking cost of a
 * hand-written /\s*$/ pattern on long inner runs of whitespace.
 *
 * @param {string} str - Text to trim.
 * @returns {string} The text without surrounding whitespace.
 */
function Trim(str)
{
    return str.trim();
}

/**
 * Scroll the page downward in fixed steps until the distance scrolled reaches
 * the scroll height of the document body.
 *
 * The stepping runs inside the page through page.evaluate(); the returned
 * promise settles after the in-page timer has been cleared.
 *
 * @param {object} page - Object exposing evaluate(fn), which runs fn in the page.
 * @returns {Promise<void>}
 */
async function autoScroll(page){
    // Self-contained on purpose: it must not reference anything outside itself.
    const scrollToBottom = async () => {
        const stepPx = 100;
        const intervalMs = 120;
        let scrolledPx = 0;
        await new Promise((done) => {
            const ticker = setInterval(() => {
                // The height is sampled before this tick's scroll is issued.
                const pageHeight = document.body.scrollHeight;
                window.scrollBy(0, stepPx);
                scrolledPx += stepPx;
                if (scrolledPx >= pageHeight) {
                    clearInterval(ticker);
                    done();
                }
            }, intervalMs);
        });
    };
    await page.evaluate(scrollToBottom);
}

(async () => {

    const browser = await puppeteer.launch({

        executablePath: '../chrome-win/chrome.exe',

        //executablePath: 'C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chrome.exe',

        //executablePath: '../Chrome-bin/chrome.exe',

        args: [

            '--disable-images', // 允许跨域

            '--disable-web-security', // 允许跨域

            '--disable-infobars',

            '--start-maximized',

            //'-proxy-server=127.0.0.1:8888',


        ],

        headless: false,

        slowMo: 250,//延迟500毫秒

        ignoreDefaultArgs: ["--enable-automation"]

    });

    console.log(process.argv);

    const page = await browser.newPage();

    await page.evaluateOnNewDocument(() => {

        Object.defineProperty(navigator, 'webdriver', {

            get: () => undefined,

        });

        const originalQuery = window.navigator.permissions.query;

        return window.navigator.permissions.query = (parameters) => (

            parameters.name === 'notifications' ?

                Promise.resolve({ state: Notification.permission }) :

                originalQuery(parameters)

        );

        Object.defineProperty(navigator, 'languages', {

            get: () => ['en-US', 'en'],

        });

        Object.defineProperty(navigator, 'plugins', {

            get: () => [1, 2, 3, 4, 5],

        });

    });



    await page.setRequestInterception(true);

    let imageBytes = [-119, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82, 0, 0, 0, 64, 0,

        0, 0, 64, 8, 3, 0, 0, 0, -99, -73, -127, -20, 0, 0, 0, 4, 103, 65, 77, 65, 0, 0, -79, -113, 11, -4, 97,

        5, 0, 0, 0, 1, 115, 82, 71, 66, 0, -82, -50, 28, -23, 0, 0, 0, -64, 80, 76, 84, 69, 8, -111, -23, -124,

        -46, -1, -127, -50, -3, -125, -48, -1, 125, -52, -3, -123, -44, -1, 127, -50, -3, -1, -1, -1, -126, -50,

        -4, -121, -41, -1, -124, -49, -3, 13, -109, -22, 9, -111, -23, -116, -46, -4, 23, -103, -20, -121, -47,

        -3, 0, -116, -24, 107, -60, -6, 29, -100, -19, 114, -58, -6, 18, -106, -21, -125, -48, -2, 95, -67, -9,

        -105, -42, -4, -3, -2, -1, -48, -20, -3, 120, -54, -5, 41, -95, -18, -12, -5, -1, 81, -74, -11, -7, -3,

        -1, -95, -38, -3, -90, -37, -5, 35, -97, -18, 67, -81, -13, -126, -49, -3, 46, -92, -17, 90, -71, -11,

        -18, -8, -1, 5, -112, -23, 78, -77, -13, -71, -29, -3, -111, -44, -3, 123, -53, -4, 58, -86, -15, 51,

        -90, -17, 101, -66, -10, -36, -15, -2, -64, -26, -3, -81, -32, -3, 72, -79, -13, 55, -87, -15, -24,

        -11, -2, 58, -81, -5, 62, -83, -14, 65, -83, -15, -41, -17, -2, -58, -24, -3, -78, -33, -5, 118, -62,

        -12, -29, -13, -3, -115, -50, -9, -122, -55, -12, 42, -89, -7, 28, 71, -109, -109, 0, 0, 3, 94, 73, 68,

        65, 84, 88, -61, -19, 87, 91, 123, -94, 48, 16, 37, -102, -124, 112, -111, 59, 40, -32, 5, 11, 10, -118,

        -73, -74, -82, 90, -37, -18, -2, -1, 127, -75, -127, -86, 107, 77, 92, -30, -21, 126, 123, 30, -115, -25, 56, 57,

        51, -103, 25, 37, -23, 63, -2, -29, 31, -121, 110, -60, -117, -119, 11, -95, 27, 44, 98, 67, 127, -108, -83, -59,

        -123, 93, 2, 5, -7, -124, -8, 72, 1, -91, 61, -56, -76, 7, -24, -34, -50, -10, 107, -14, 25, 84, 4, -71, 47, -122,

        -24, -81, -17, 92, 5, -75, -56, 45, -128, 82, 62, 9, 69, 49, 94, 41, -128, 112, 1, 20, 123, -36, -52, 95, 98, -123,

        -36, 5, -62, -81, 77, -4, 39, 0, -56, 95, 0, -64, -66, -127, 127, 109, 29, 15, 62, -38, -3, -115, -65, 64, 41, 105,

        -128, -17, 103, -9, -7, 113, -22, -109, 70, -128, -24, 110, 58, 61, 23, 17, 1, 40, -63, -67, -70, 28, 40, 34, 124,

        -46, 66, 119, 82, 49, 38, -75, 1, 16, -30, 59, 76, 124, 58, 2, 46, -65, -96, 38, 85, 0, 88, 77, -114, 35, 62, 31,

        -50, -97, -97, -25, -75, -126, -78, -32, -15, 29, 88, 57, 104, 30, 100, 89, 14, -71, -4, -47, 80, -106, -5, 9, -84,

        -22, -55, -26, -123, -80, -81, 2, -128, 51, -7, 88, 76, -62, 25, 100, 5, 58, 31, 63, -117, -9, -2, -44, -84, 92, 0,

        49, -25, -7, -81, -22, 20, 76, 55, -110, 21, 127, 76, 85, 54, -128, -39, -54, -78, -34, -121, 114, 29, -126, 50, -32,

        -28, -80, -92, 55, -64, 115, 121, 98, -27, -53, -11, -90, -53, 24, -87, 38, -102, 100, -3, 60, 76, 55, -107, 0, -80,

        -39, 76, -114, 43, 7, 90, -35, 97, -106, -21, -71, 22, -113, 24, -127, -7, -69, -91, -21, -70, 118, 60, 84, 2, 126,

        -55, 22, -45, -94, -50, -63, 104, -22, -28, -70, -98, 123, 93, -26, 10, -35, -126, 10, 72, 86, 119, 93, -37, -61, 49,

        -31, -91, 22, 48, -41, 75, 43, -105, 114, -19, 76, 51, 77, -13, 100, 71, 103, 111, 73, 82, 110, 37, -57, -78, 78, -28,

        43, -73, 12, 97, 103, -76, -103, -19, 61, -55, 43, -66, -82, 109, 70, -63, 96, -30, 126, 73, -88, -10, 88, -41, -30,

        -49, 48, -103, 99, 122, 61, -27, -119, 91, 70, -101, 62, 45, -126, 94, -104, 116, 72, 68, 84, -107, -72, 123, 47, -73,

        44, -19, -43, 77, 85, 85, -123, 100, -98, -124, -43, -7, 48, -20, 96, -94, -68, 48, 2, 1, -62, 111, -14, 9, -61, -11,

        103, -80, 90, 77, -76, -36, 48, 60, -61, -77, -116, 96, 101, 7, -97, -21, -31, -7, -8, -61, 36, 74, -63, -42, 33, 30,

        -11, -28, 63, -40, -66, -123, -45, -107, -10, -125, -30, -105, -105, 108, 103, -57, -16, -6, -16, 13, -6, -82, -57,

        -44, 33, 12, -27, 91, 36, 3, 42, -80, 98, 63, -17, 119, 8, -54, 24, 11, -52, 41, -13, 69, -71, -9, 113, -40, -54, 28,

        -116, 48, -29, -30, 4, -51, -121, -78, 48, -98, 33, -29, 98, -127, -70, -113, 9, -36, -66, -24, -41, -121, 34, 24, 97,

        127, -52, 100, -127, -25, -63, 29, 12, -25, 126, 116, -101, 5, -35, 110, -121, -62, 2, 107, -84, 76, 56, 35, -27, 91,

        29, 52, 88, -128, -106, -100, -114, -122, 15, -126, -4, -87, -55, -42, 81, 85, -53, 64, 52, 15, 51, -56, -21, 72, -12,

        61, -61, 55, 33, 126, -65, -125, 57, -81, 89, -54, 34, 31, -117, -123, 16, 66, -30, -69, -52, -94, -80, 79, -85, -79,

        -66, 21, -77, -112, 54, 118, 120, -29, -30, 75, 61, -42, -95, 72, 38, 123, 117, -61, 5, -19, 111, -91, -8, -92, 124,

        77, -75, -115, -104, 5, -11, -104, 7, 87, 49, 100, 32, 61, -11, 126, 33, -127, -45, -104, -57, 23, 31, -116, 8, -112,

        -57, 5, -82, 6, -36, 101, -84, -105, 15, 9, 92, 58, -21, 56, 61, -17, 53, -16, -7, 33, -127, -45, 120, -47, -125, -53,

        94, -126, 19, -95, 66, -2, -77, -85, 20, -41, 1, -76, -38, 116, -126, 11, 8, 108, -23, 119, -37, -41, 33, -100, 28,

        -16, 65, -108, -30, 110, 79, -28, 45, 19, 18, -99, -74, -71, -54, 5, 47, -86, 55, 51, -32, -17, 98, -97, 116, -6, -51,

        2, -76, -89, -105, 78, -127, -38, -25, 68, 44, 107, 7, -108, 40, -109, -116, 50, 85, -73, 34, -107, 92, -19, 72, 59,

        92, -15, -46, -44, -111, 92, 64, 82, -124, 2, -121, 110, -23, 46, -128, 7, -111, 74, 70, 65, 101, -99, -83, -48, 123,

        -48, -34, 58, -64, 8, -37, 75, -3, 107, -70, 9, -44, -14, -10, 108, 62, -3, 87, 64, -105, 111, 58, -26, -99, -52, -47,

        47, 29, -95, 57, 13, 116, 67, 57, 119, 3, -51, -55, -66, -17, 25, 25, 106, 118, 113, -40, -59, 105, -53, -71, 34, -3,

        6, 71, 84, 119, -92, 72, -109, -15, -121, 0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126];

    page.on('request', interceptedRequest => {

        //判断如果是 图片请求  就直接拦截

        if (interceptedRequest.url().endsWith('.png') || interceptedRequest.url().endsWith('.jpg')|| interceptedRequest.url().endsWith('.PNG') || interceptedRequest.url().endsWith('.JPG'))

        //interceptedRequest.abort();   //终止请求

            interceptedRequest.respond({

                status: 200,

                body: Buffer.from(imageBytes)

            });


        else

            interceptedRequest.continue();//弹出

    });


    // 设置浏览器视窗

    /*

     page.setViewport({

     width: 1920,

     height: 1080,

     });

     */

    await page.emulate(iPhone);

    //await page.goto('https://m.toutiaoimg.com/item/6706038150935888391/?app=news_article_lite&timestamp=1563929593&req_id=20190724085313010152028146551F0B7&group_id=6706038150935888391');

    //await page.goto("http://m.gifshow.com/s/U6kK7y0Q");

    await page.goto(process.argv[2]);

    await page

        .mainFrame()

        .addScriptTag({

            url: 'https://cdn.bootcss.com/jquery/3.2.0/jquery.min.js'

        });

    await page.waitFor(2000);


    await page.mouse.move( 126,126 );

    await page.mouse.down();

    await page.mouse.move( 126, 19 );

    await page.mouse.up();


    //const input_text= await page.$("#kw");

    var y=0;

    while(true) {

        var result = await page.evaluate((y) => {

            console.log("加载完毕");

            if (y == 0) {

                var top = 0

                //每200毫秒滚动100px

                var timer = setInterval(() => {

                    console.log(window.scrollY);

                    window.scrollTo(0, top += 100)

                }, 200);

                y = 1;

            }

            /*

             var ii;

             for (ii = y; ii <= y + 1000 * 5; ii += 100) {

             window.scrollTo(0, ii);

             }

             y = ii;

             */

            try {

                var m_ReturnOBJ = {};

                m_ReturnOBJ.sMsg="";

                var m_MVList = [];

                m_ReturnOBJ.y = y;

               // var allcnt = $('div[class="feed-header"]').text();

               // allcnt.replace("作品", "");

               // allcnt = Trim(allcnt);

               // var nn = parseInt(allcnt);


                /*

                 var m_lis=$('li[class="photo "]').find('a');

                 var name=$('div[class="name"]').text();

                 for(var i=0;i<m_lis.length;i++)

                 {

                 var m_one={};

                 m_one.mvurl= m_lis.attr('href');

                 m_one.mvpic= m_lis.attr('data-lazy');

                 m_one.nickname=name;

                 m_one.type="kuaishou";

                 m_ReturnOBJ.push(m_one);

                 }

                 */

                var m_lis = $('img[class="image-main"]');

                var name = $('div[class="user-info-name"]').text();


                var mytext = $('div[class="footer"]').text();

                var bOver = false;

                if (mytext.indexOf("已经到底") != -1) {

                    bOver = true;

                }

                //bOver = true;

                if (bOver) {

                    for (var i = 0; i < m_lis.length; i++) {

                        var m_one = {};

                        m_one.mvurl = "";//m_lis.attr('href');

                        m_one.mvpic = $(m_lis[i]).attr('src');

                        m_one.nickname = name;

                        m_one.type = "kuaishou";

                        m_one.title = "";

                        m_MVList.push(m_one);

                    }

                    m_ReturnOBJ.bOK = true;

                    m_ReturnOBJ.sMgs=mytext;

                }

                else {

                    m_ReturnOBJ.bOK = false;

                }



            m_ReturnOBJ.m_MVList = m_MVList;

        }

            catch(err)

            {

                m_ReturnOBJ.bOK=true;

                m_ReturnOBJ.sMsg=err.message;

            }

            return m_ReturnOBJ;


        }, y);

        //console.log(result);

        if (result.bOK) {

            returnCrawler(result.m_MVList);//m_MVList);

            break;

        }

        else {

            y = result.y;


        }

    }

    await page.screenshot({path: 'kuaishou.png'});

    browser.close();

})();



Puppeteer性能优化与执行速度提升

我们需要找到下面几种配置来提升速度:


如果将 DOM 解析和渲染放到同一进程,就能省去进程上下文切换的开销,从而缩短执行时间。对应的配置是 single-process

部分功能disable掉,比如GPU、Sandbox、插件等,减少内存的使用和相关计算。

如果启动Chromium时能绑定到某个CPU核上也能提升速度(单核上进行进程切换耗费的时间更少)。可惜没有找到对应的配置,官方文档写的是Chromium启动时会自动绑定CPU大核(ARM架构的CPU通常有大小核之分),依此推测Chromium启动时是会绑核的。(此处我并未验证)

最后配置如下:


// Final launch configuration. Switch names start with two ASCII hyphens and
// must be written with plain quotes; typographic quotes or dashes are not
// valid JavaScript.
const browser = await puppeteer.launch({
    headless: true,
    args: [
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--disable-setuid-sandbox',
        '--no-first-run',
        '--no-sandbox',
        '--no-zygote',
        '--single-process'
    ]
});

Chromium 启动参数列表 文档中的配置项都可以尝试看看,我没有对所有选项做测试,但可以肯定存在某些选项能提升Chromium速度。


优化Chromium执行流程

接下来我们再单独优化Chromium对应的页面。我之前的文章中提过,如果每次请求都启动Chromium,再打开tab页,请求结束后再关闭tab页与浏览器。流程大致如下:


请求到达->启动Chromium->打开tab页->运行代码->关闭tab页->关闭Chromium->返回数据


真正运行代码的只是tab页面,理论上启动一个Chromium程序能运行成千上万的tab页,可不可以复用Chromium每次只打开一个tab页然后关闭呢?当然是可以的,Puppeteer提供了puppeteer.connect() 方法,可以连接到当前打开的浏览器。流程如下:


请求到达->连接Chromium->打开tab页->运行代码->关闭tab页->返回数据


代码如下:


const MAX_WSE = 4;  // number of browsers to launch
let WSE_LIST = []; // browserWSEndpoint of every launched browser
init();

// Serve each request from one of the pre-launched browsers: connect, open a
// tab, take the screenshot, close the tab and detach again.
// NOTE(review): assumes `app` is an Express application (res.status / res.send).
app.get('/', async function (req, res) {
    let browser;
    try {
        // init() fills WSE_LIST asynchronously, so pick only among endpoints
        // that are already registered.
        if (WSE_LIST.length === 0) {
            res.status(503).send('Browser pool is not ready');
            return;
        }
        const browserWSEndpoint = WSE_LIST[Math.floor(Math.random() * WSE_LIST.length)];
        browser = await puppeteer.connect({browserWSEndpoint});
        const page = await browser.newPage();
        try {
            await page.goto('file://code/screen/index.html');
            await page.setViewport({
                width: 600,
                height: 400
            });
            await page.screenshot({path: 'example.png'});
        } finally {
            // Close the tab even when navigation or the screenshot failed.
            await page.close();
        }
        res.send('Hello World!');
    } catch (err) {
        // Without this the request would hang and the rejection go unhandled.
        console.error(err);
        res.status(500).send('Screenshot failed');
    } finally {
        if (browser) {
            // Detach from the shared browser without shutting it down.
            await browser.disconnect();
        }
    }
});


/**
 * Launch MAX_WSE headless browsers one after another and store each browser's
 * WebSocket endpoint in WSE_LIST.
 *
 * @returns {Promise<void>} Settles once every browser has been launched, so
 *     callers that need the list can wait for it. Existing callers that ignore
 *     the return value keep working.
 */
function init(){
    return (async () => {
        for (let i = 0; i < MAX_WSE; i++) {
            const browser = await puppeteer.launch({headless:true,
                args: [
                '--disable-gpu',
                '--disable-dev-shm-usage',
                '--disable-setuid-sandbox',
                '--no-first-run',
                '--no-sandbox',
                '--no-zygote',
                '--single-process'
            ]});
            // Declared locally: an undeclared assignment creates a global in
            // sloppy mode and throws in strict mode / ES modules.
            const browserWSEndpoint = await browser.wsEndpoint();
            WSE_LIST[i] = browserWSEndpoint;
        }
        console.log(WSE_LIST);
    })();
}

利用cluster优化Puppeteer

通常情况下我们会使用 .map() 搭配 Promise.all() 的方式并行处理异步,但是在使用Puppeteer批量截图时发现Promise.all会打开多个浏览器,导致机器性能急剧下降。


Promise.all() 并行处理


image

利用 reduce 使多个 Promise 顺序执行


// Chain every task onto the previous one, starting from a resolved promise,
// so each URL is handled only after the one before it has fulfilled.
await tasks.reduce(
  // doAnalyze is an async function
  (chain, url, idx) => chain.then(() => doAnalyze(url, idx)),
  Promise.resolve()
)

场景:有40个URL,需要获取每个博客的首页截图


如果是Promise.all(),程序启动会同时打开20+的chromium浏览器,导致机器卡死。

使用reduce缓解了压力,但没充分利用多核性能

引入 Cluster

// cluster_index.js 入口文件

const cluster = require('cluster');


// Load the entry function that matches this process's role (master or worker)
// and run it; a failure is reported together with a stack trace.
(async () => {
  const run = cluster.isMaster
    ? require('./cluster_master')
    : require('./cluster_worker');

  try {
    await run();
  } catch (e) {
    // console.trace prints the error followed by the current call stack
    console.trace(e);
  }
})();

// cluster_master.js master进程分配任务


const cluster = require('cluster');

const numCPUs = require('os').cpus().length;


// List of tasks to process: one URL per screenshot job.
let arr = [
  'https://github.com/guoguoya',
  'http://www.52cik.com',
  'http://zhalice.com',
  'https://www.yzqroom.cn',
  'http://zxh.name',
  'https://fogdong.github.io/',
  'http://github.com/elsieyin',
  'https://summer.tlb058.com',
  'https://skymon4.cn',
  'http://www.jiweiqing.cn',
  'http://effect.im',
  'http://dingkewz.com',
  'http://xcdh.me',
  'http://d2g.io',
  'http://codingdemon.com',
  'http://blog.leanote.com/dujuncheng',
  'http://niexiaotao.com',
  'http://zhengchengwen.com',
  'http://blog.tophefei.com',
  'https://zh-rocco.github.io',
  'http://wangyn.net',
  'http://dscdtc.ml',
  'http://jweboy.github.io',
  'http://www.wenghaoping.com',
  'http://zhoujingchao.github.io',
  'http://kyriejoshua.github.io/jo.github.io/',
  'http://www.withyoufriends.com',
  'http://if2er.com',
  'https://github.com/zhou-yg',
  // NOTE(review): host is "github" without a top-level domain — looks like a typo; confirm the intended URL.
  'http://github/suoutsky',
  'http://richardsleet.github.io',
  'http://www.89io.com',
  'https://guoshencheng.com',
  'http://www.landluck.com.cn',
  // NOTE(review): same URL as three entries above — confirm whether the duplicate is intended.
  'http://www.89io.com',
  'http://myoungxue.top',
  'https://github.com/Wangszzju',
  'http://www.hacke2.cn',
  'https://github.com/enochjs',
  'https://i.jakeyu.top',
  'http://muyunyun.cn',
];


module.exports = async () => {

  // 每个 CPU 分配 N 个任务

  const n = Math.floor(arr.length / numCPUs);

  // 未分配的余数

  const remainder = arr.length % numCPUs;


  for (let i = 1; i <= numCPUs; i += 1) {

    const tasks = arr.splice(0, n + (i > remainder ? 0 : 1));

    // 将任务编号传递到 Cluster 内启动

    cluster.fork({ tasks: JSON.stringify(tasks) });

  }

  cluster.on('exit', (worker) => {

    console.log(`worker #${worker.id} PID:${worker.process.pid} died`);

  });

  cluster.on('error', (err) => {

    console.log(`worker #${worker.id} PID ERROR: `, err);

  });

};

// cluster_worker.js worker进程 完成任务


const cluster = require('cluster');

const puppeteer = require('puppeteer');


// Refuse to run when this file is started directly instead of being forked
if (cluster.isMaster) {
  // cluster.worker is not available in the master process, so reading
  // cluster.worker.id here would throw; log the process id instead.
  console.log('----', process.pid)
  process.exit(0);
}


module.exports = async () => {

  const env = process.env.tasks;

  let tasks = [];

  if (/^\[.*\]$/.test(env)) {

    tasks = JSON.parse(env);

  }

  if (tasks.length === 0) {

    console.log('????', tasks)

    // 非法启动, 释放进程资源

    process.exit(0);

  }

  console.log(`worker #${cluster.worker.id} PID:${process.pid} Start`);

  await tasks.reduce((sequence, url, idx) => {

    return sequence.then(() => {

      return doAnalyze(url, idx);

    });

  }, Promise.resolve())


  console.log(cluster.worker.id + ' 顺利完成');

  process.exit(0);

};


/**
 * Open `url` in a freshly launched browser, pause, and save a PNG screenshot
 * to ./img_cluster/<workerId>-<i>.png.
 *
 * Failures are logged and swallowed so one bad URL does not stop the remaining
 * tasks. The browser is closed in every case, including when a step throws.
 *
 * @param {string} url - Address of the page to capture.
 * @param {number} i - Task index, used in the screenshot file name.
 * @returns {Promise<void>}
 */
async function doAnalyze(url, i) {
  let browser;
  try {
    browser = await puppeteer.launch({
      // A manually downloaded Chromium needs an explicit path; the default is
      // /<project>/node_modules/puppeteer/.local-chromium/
      // executablePath: '/Users/huqiyang/Documents/project/z/chromium/Chromium.app/Contents/MacOS/Chromium',
      // Launch timeout in milliseconds
      timeout: 30000,
      // Ignore HTTPS errors when visiting https pages
      ignoreHTTPSErrors: true,
      // Open DevTools; when true, headless is always false
      devtools: false,
      // Headless mode off: a browser window is opened
      headless: false
    });
    const page = await browser.newPage();
    await page.setViewport({width: 1920, height: 1080});
    await page.goto(url);
    // NOTE(review): page.waitFor exists only in older Puppeteer releases —
    // confirm the installed version or switch to a timer-based delay.
    await page.waitFor(4000);
    console.log(cluster.worker.id, url, i, '截图中...');
    await page.screenshot({
      path: `./img_cluster/${cluster.worker.id}-${i}.png`,
      // path: '3.png',
      type: 'png',
      // quality: 100, only valid for jpg
      // fullPage: true,
      // Capture a region; clip and fullPage are mutually exclusive
      // clip: {
      //   x: 0,
      //   y: 0,
      //   width: 1920,
      //   height: 600
      // }
    });
  } catch (error) {
    console.log(cluster.worker.id, url, i)
    console.log(error)
  } finally {
    // Close the browser on success and on failure so no Chromium is left behind.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.log(closeError)
      }
    }
  }
};

多个page轮询与多个browser轮询

为了性能,现有解决方案是初始化若干个browser,请求打过来时,直接在browserList中取一个browser实例使用。

作为对比,可以参考初始化一个browser,预先打开若干个page,请求打过来时,直接在pageList中取一个page实例使用。



node puppeteer拦截谷歌请求、设置浏览器响应-爬取电子书链接

一、拦截谷歌请求:

(1)拦截谷歌请求,启用后会激活 request.abort、request.continue 和 request.respond 方法。

await page.setRequestInterception(true);

(2)监听request事件,对请求做出操作

一旦启用请求拦截,每个请求都将停止,除非它继续,响应或中止

page.on('request', 回调函数(request 形参,包含上述方法))


如拦截谷歌广告:

// Enable request interception
await page.setRequestInterception(true);
// Inspect every request; the parameter is the intercepted request object
page.on('request',(interceptedRequest)=>{
    // interceptedRequest.url() returns the request URL; URLs without a host
    // are treated as an empty hostname
    const hostname=Url.parse(interceptedRequest.url()).hostname||'';
    // Google ad host, or any other host containing "google"
    if(hostname=='googleads.g.doubleclick.net'||hostname.indexOf('google')!=-1){
        // Block the request
        interceptedRequest.abort();
    }else{
        interceptedRequest.continue();
    }
})


二、如果爬取时间超时导致失败:


(1)通过延迟函数,将延迟时间增长

(2)使用page.setDefaultNavigationTimeout(0); 将浏览器响应时间改为无限长,默认为30秒

(3)在配置信息中设置,timeout=0; 效果和(2)一样


三、获取资源后关闭页面,减小性能消耗


page.close();


四、通过page.$/$$()返回的ElementHandle 获取元素属性


// Resolve the element, then read one of its properties as a handle
let xx=await page.$('选择器')
let xxx=await xx.getProperty('属性');
// jsonValue() is the public way to unwrap a handle into a plain value;
// _remoteObject is an internal field
let xxxx=await xxx.jsonValue();

代码示例:

let puppeteer=require('puppeteer');

let axios =require('axios');

let Url=require('url');

let fs=require('fs');


let http='https://sobooks.cc/';

/**
 * Crawl the first listing page of sobooks.cc and append one "title,link" line
 * per book to a text file.
 *
 * Flow: read the total page count from the home page, collect the book links
 * of listing page 1, then open every book's detail page (staggered by 300 ms)
 * and store its download link.
 *
 * @returns {Promise<void>} Settles after all detail pages have been handled
 *     and the browser has been closed.
 */
async function run(){

    /**
     * Resolve with 'ok延迟' after `time` milliseconds.
     * @param {number} time - Delay in milliseconds.
     */
    function wait(time){
        return new Promise(function(resolve){
            // The delay is an argument of setTimeout, not of the Promise constructor.
            setTimeout(function(){
                resolve('ok延迟')
            },time)
        })
    }

    const options={
        headless:false,
        slowMo:250,
        defaultViewport:{
            width:1000,
            height:800
        }
    }

    const browser=await puppeteer.launch(options);

    /**
     * Open a new tab whose requests to the Google ads host are aborted;
     * every other request continues unchanged.
     */
    async function openPage(){
        const page=await browser.newPage();
        await page.setRequestInterception(true);
        page.on('request',(interceptedRequest)=>{
            const urlObj=Url.parse(interceptedRequest.url());
            if(urlObj.hostname==='googleads.g.doubleclick.net'){
                interceptedRequest.abort();
            }else{
                interceptedRequest.continue();
            }
        })
        return page;
    }

    /** Read the total number of listing pages from the pagination bar. */
    async function getAllNum(){
        const page=await openPage();
        try{
            await page.goto(http);
            return await page.$eval('.pagination li:last-child span',(ele)=>{
                const text=ele.innerText;
                // Strip the two characters before and after the number
                return text.substring(2,text.length-2).trim();
            })
        }finally{
            // Close the tab once the value has been read
            await page.close();
        }
    }

    /**
     * Collect all book links of one listing page and process each of them.
     * @param {number} num - Listing page number.
     */
    async function pageList(num){
        const listUrl='https://sobooks.cc/page/'+num;
        const page=await openPage();
        let arr;
        try{
            // 0 disables the navigation timeout
            page.setDefaultNavigationTimeout(0);
            await page.goto(listUrl);
            arr=await page.$$eval('.card .card-item .thumb-img>a',(ele)=>{
                return Array.from(ele,(item)=>({
                    href:item.getAttribute('href'),
                    title:item.getAttribute('title')
                }));
            })
        }finally{
            await page.close();
        }

        // Open the detail pages concurrently, each start delayed by 300 ms more
        // than the previous one. One failing book must not abort the others.
        await Promise.all(arr.map(async (item,index)=>{
            await wait(300*index);
            try{
                await getPageInfo(item);
            }catch(err){
                console.error(item.title,err);
            }
        }))
    }

    /**
     * Open a book's detail page, extract the download link and append it to the output file.
     * @param {{href: string, title: string}} pageObj - Book link and title.
     */
    async function getPageInfo(pageObj){
        const page=await openPage();
        let href;
        try{
            await page.goto(pageObj.href);
            const eleA=await page.$('.dltable tr:nth-child(3) a:last-child')
            if(!eleA){
                throw new Error('download link not found: '+pageObj.href);
            }
            // Read the element's href through the public handle API
            href=await (await eleA.getProperty('href')).jsonValue();
        }finally{
            // Every detail tab is closed again, otherwise tabs pile up
            await page.close();
        }

        // The real link is carried in the "?url=" part of the href
        const target=String(href).split('?url=')[1];
        if(target===undefined){
            throw new Error('no ?url= part in link: '+href);
        }
        const content={
            title:pageObj.title,
            url:target+'\n'
        }
        console.log(pageObj.title);
        await new Promise((resolve,reject)=>{
            fs.writeFile("C:/Users/10853/Desktop/爬虫电子书.txt",content.title+','+content.url,{flag:'a'},function(err){
                if(err){
                    reject(err);
                    return;
                }
                console.log('ok');
                resolve();
            })
        })
    }

    try{
        const pageNum=await getAllNum();
        console.log('total pages:',pageNum);
        await pageList(1);
    }finally{
        // Release Chromium whether the crawl succeeded or failed
        await browser.close();
    }
}


run();












上一篇:实用代码下载

下一篇:NodeJS同行者

Top