每天学习一点点,成功增加一大步

NodeJs笔记:puppeteer爬虫

NodeJS zhanghui 1165℃

以前用 PHP 写后端时也曾写过 PHP 爬虫,现在试着用 Node.js 来写爬虫。puppeteer 可以模拟浏览器的浏览行为,这里做了两个实例:第 1 个实例把网页页面保存为图片(截图),第 2 个实例通过搜索把搜索结果中的图片列表保存下来。

实验工具

  • 语言:TypeScript
  • 依赖库:puppeteer

puppeteer 的安装:

npm i --save puppeteer

手册:https://github.com/GoogleChrome/puppeteer

实例:将百度首页保存为图片(截图)

import * as fs from 'fs';
import * as path from 'path';
import * as puppeteer from 'puppeteer';

// Directory next to this script that receives the screenshots.
const pathScreenshot = path.join(__dirname, 'screenshot');

/**
 * Opens http://www.baidu.com in a Puppeteer-launched browser and saves a
 * screenshot of the page as `<timestamp>.png` inside `pathScreenshot`.
 */
(async () => {
    // Make sure the target directory exists before the screenshot is written into it.
    if (!fs.existsSync(pathScreenshot)) {
        fs.mkdirSync(pathScreenshot);
    }

    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto('http://www.baidu.com');
        await page.screenshot({path: path.join(pathScreenshot, `${Date.now()}.png`)});
    } finally {
        // Always shut the browser down, even when navigation or the screenshot fails,
        // so no browser process is left behind.
        await browser.close();
    }
})().catch((err) => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});

 

实例:抓取搜索图片列表

import * as puppeteer from 'puppeteer';
import * as path from 'path';
import * as http from 'http';
import * as https from 'https';
import * as fs from 'fs';
import {promisify} from "util";

// Promise-returning wrapper around fs.writeFile so it can be awaited.
const writeFile = promisify(fs.writeFile);

// Entry page of Baidu image search.
const url = 'http://image.baidu.com/';
// Directory next to this script that receives the downloaded images.
const mnPath = path.join(__dirname, 'mn');
// Create the target directory up front: writing a file into a missing
// directory fails with ENOENT, so every download would be lost otherwise.
if (!fs.existsSync(mnPath)) {
    fs.mkdirSync(mnPath);
}

/**
 * Searches Baidu Images for "狗", collects the `src` of every `img.main_img`
 * element on the result page and saves each image into `mnPath`.
 */
(async () => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);
        console.log(`go to ${url}`);
        // Resize the browser viewport.
        await page.setViewport({
            width: 1920, height: 1080
        });
        console.log('reset viewport');
        // Focus the search input box.
        await page.focus('#kw');
        // Type the search keyword into the focused input.
        await page.keyboard.sendCharacter('狗');
        // Start waiting for the navigation BEFORE clicking: registering a 'load'
        // listener after the click races with the navigation and can miss it.
        await Promise.all([
            page.waitForNavigation({waitUntil: 'load'}),
            page.click('.s_search'),
        ]);
        console.log('go to search list');
        console.log('page loading done start fetch...');
        // Collect the src of every result image on the page.
        const srcs = await page.evaluate(() => {
            const images = document.querySelectorAll('img.main_img');
            return Array.prototype.map.call(images, img => img.src);
        });
        console.log(`get ${srcs.length} images, start download`);
        // Save the images one after another. A for...of loop is used because
        // forEach ignores the promises returned by an async callback, which
        // let the browser be closed while work was still pending.
        for (const src of srcs) {
            // Fixed 200 ms pause between images to throttle the downloads.
            await new Promise((resolve) => setTimeout(resolve, 200));
            try {
                await srcToImg(src, mnPath);
            } catch (err) {
                // One failed image must not abort the remaining ones.
                console.error('failed to save image:', err);
            }
        }
    } finally {
        // Close the browser on success and on failure alike.
        await browser.close();
    }
})().catch((err) => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});

/**
 * Saves one image source into `dir`, choosing the strategy from its form.
 *
 * A source ending in an image extension (.jpg, .jpeg, .png, .gif, in any
 * letter case) is downloaded with `urlToImg`; anything else is handed to
 * `base64ToImg` as a data URI.
 *
 * @param {string} src - Image URL or base64 data URI taken from an <img>.
 * @param {string} dir - Directory the image file is written into.
 * @returns {Promise<void>} Settles when the chosen saver settles.
 */
const srcToImg = async (src, dir) => {
    console.log(src);
    // Case-insensitive and including "jpeg": otherwise URLs such as
    // ".../a.JPG" or ".../a.jpeg" were wrongly treated as base64 data.
    if (/\.(jpe?g|png|gif)$/i.test(src)) {
        await urlToImg(src, dir);
    } else {
        await base64ToImg(src, dir);
    }
};

/**
 * Downloads the image at `url` and stores it in `dir`.
 *
 * The file is named `<timestamp>-<random suffix><ext>`, where `<ext>` is the
 * extension of `url`. The random suffix keeps downloads that start within
 * the same millisecond from overwriting each other. Redirects are not followed.
 *
 * @param {string} url - http: or https: address of the image.
 * @param {string} dir - Existing directory the file is written into.
 * @returns {Promise<void>} Resolves once the file has been written completely;
 *     rejects on a request, response or write error, or on a non-2xx status.
 */
const urlToImg = async (url, dir) => {
    // Pick the client module that matches the URL's protocol.
    const mod = /^https:/.test(url) ? https : http;
    const ext = path.extname(url);
    const unique = Math.random().toString(36).slice(2, 8);
    const file = path.join(dir, `${Date.now()}-${unique}${ext}`);
    // Wrap the callback/stream API so callers can really await the download.
    await new Promise((resolve, reject) => {
        mod.get(url, (res) => {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                // Drain the response so the socket is released, then fail
                // instead of saving an error page as an image.
                res.resume();
                reject(new Error(`unexpected status ${res.statusCode} for ${url}`));
                return;
            }
            res.on('error', reject);
            res.pipe(fs.createWriteStream(file))
                .on('error', reject)
                .on('finish', () => {
                    console.log(file);
                    resolve();
                });
        }).on('error', reject);
    });
};

/**
 * Decodes a base64 data URI (`data:<mime>;base64,<payload>`) and writes the
 * payload into `dir` as `<timestamp>-<random suffix>.<ext>`.
 *
 * `<ext>` is the MIME subtype, with "jpeg" shortened to "jpg". The random
 * suffix keeps images saved within the same millisecond from overwriting
 * each other. The function is best-effort: invalid input and write failures
 * are logged, never thrown.
 *
 * @param {string} base64Str - Data URI to decode.
 * @param {string} dir - Existing directory the file is written into.
 * @returns {Promise<void>} Resolves after the file is written or the problem is logged.
 */
const base64ToImg = async (base64Str, dir) => {
    const matches = /^data:(.+?);base64,(.+)$/.exec(base64Str);
    // Keep only the subtype of "type/subtype", dropping any ";param=value"
    // parts so they cannot end up in the file name.
    const subtype = matches && matches[1].split(';')[0].split('/')[1];
    if (!subtype) {
        console.log('非法 base64');
        return;
    }
    const ext = subtype.replace('jpeg', 'jpg');
    const unique = Math.random().toString(36).slice(2, 8);
    const file = path.join(dir, `${Date.now()}-${unique}.${ext}`);
    try {
        await writeFile(file, matches[2], 'base64');
        console.log(file);
    } catch (ex) {
        // Report I/O problems as what they are rather than as invalid base64.
        console.error(`failed to write ${file}:`, ex);
    }
};

 

转载请注明:隨習筆記 » NodeJs笔记:puppeteer爬虫