以前用 PHP 写后端时也曾写过 PHP 爬虫,现在试着用 Node.js 来写爬虫。puppeteer 可以模拟浏览器来浏览网页,这里做了两个实例:第 1 个实例是将网页页面保存为图片(截图),第 2 个实例是通过搜索,把搜索结果中出现的图片列表保存下来。
实验工具
- 语言:TypeScript
- 依赖库:puppeteer
puppeteer 的安装
npm i --save puppeteer
实例:将百度首页保存为截图
import * as puppeteer from 'puppeteer';
import * as path from 'path';
import * as fs from 'fs';
// Directory (next to this script) that receives the screenshots.
const pathScreenshot = path.join(__dirname, 'screenshot');
// puppeteer does not create missing directories, so make sure the target
// exists before taking the screenshot.
if (!fs.existsSync(pathScreenshot)) {
  fs.mkdirSync(pathScreenshot);
}

/**
 * Opens the Baidu home page in a headless browser and stores a screenshot
 * named after the current timestamp inside `pathScreenshot`.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('http://www.baidu.com');
    await page.screenshot({ path: path.join(pathScreenshot, `${Date.now()}.png`) });
  } finally {
    // Always shut the browser down, even when navigation or the screenshot fails.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
实例:抓取搜索图片列表
import * as puppeteer from 'puppeteer';
import * as path from 'path';
import * as http from 'http';
import * as https from 'https';
import * as fs from 'fs';
import {promisify} from "util";
// Promise-returning variant of fs.writeFile so it can be awaited.
const writeFile = promisify(fs.writeFile);
// Entry page of the image search.
const url = 'http://image.baidu.com/';
// Directory (next to this script) that receives the downloaded images.
const mnPath = path.join(__dirname, 'mn');
// Neither fs.createWriteStream nor fs.writeFile creates missing directories,
// so make sure the output directory exists before anything is saved into it.
if (!fs.existsSync(mnPath)) {
  fs.mkdirSync(mnPath);
}
/**
 * Crawl entry point: opens the image search page, searches for the keyword,
 * collects the `src` of every `img.main_img` on the result page and saves
 * each one into `mnPath` via `srcToImg`.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);
    console.log(`go to ${url}`);

    // Resize the browser viewport.
    await page.setViewport({ width: 1920, height: 1080 });
    console.log('reset viewport');

    // Focus the search box and type the keyword into it.
    await page.focus('#kw');
    await page.keyboard.sendCharacter('狗');

    // Start waiting for the navigation BEFORE clicking: attaching a 'load'
    // listener after the click can miss the event.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'load' }),
      page.click('.s_search'),
    ]);
    console.log('go to search list');
    console.log('page loading done start fetch...');

    // Collect the src of every result image on the page.
    const srcs = await page.evaluate(() => {
      const images = document.querySelectorAll('img.main_img');
      return Array.from(images, (img) => img.src);
    });
    console.log(`get ${srcs.length} images, start download`);

    // Save the images one after another. `forEach` with an async callback
    // would not wait, and the browser would be closed while work is pending.
    for (const src of srcs) {
      // Short pause between downloads to avoid hammering the server.
      await new Promise((resolve) => setTimeout(resolve, 200));
      await srcToImg(src, mnPath);
    }
  } finally {
    // Always shut the browser down, even when a step above fails.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Saves one image `src` into `dir`, choosing the strategy by its scheme:
 * `data:` URIs are decoded by `base64ToImg`, http(s) URLs are downloaded by
 * `urlToImg`. Anything else (empty, `blob:`, ...) is logged and skipped.
 *
 * Dispatching on the scheme rather than on a trailing `.jpg|.png|.gif` keeps
 * URLs with query strings or other extensions from being treated as base64.
 *
 * @param {string} src - Value of an `<img>` element's `src`.
 * @param {string} dir - Directory the image is written to.
 * @returns {Promise<void>}
 */
const srcToImg = async (src, dir) => {
  console.log(src);
  if (typeof src !== 'string' || src === '') {
    console.log('skip empty src');
    return;
  }
  if (/^data:/i.test(src)) {
    await base64ToImg(src, dir);
  } else if (/^https?:\/\//i.test(src)) {
    await urlToImg(src, dir);
  } else {
    console.log(`skip unsupported src: ${src}`);
  }
};
// url => image
/**
 * Downloads the image at `url` into `dir` as `<timestamp><ext>`, where the
 * extension comes from the URL's path (query string excluded).
 *
 * The returned promise settles only after the file has been fully written.
 * Download is best-effort: an invalid URL, a request/stream error or a
 * non-2xx response is logged and yields `undefined` instead of rejecting.
 * Redirects are not followed.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} dir - Existing directory to write into.
 * @returns {Promise<string|undefined>} Path of the written file, or
 *   `undefined` when the download failed.
 */
const urlToImg = async (url, dir) => {
  // Pick the client module matching the protocol.
  const mod = /^https:/i.test(url) ? https : http;

  let ext;
  try {
    // Use only the path part so "a.jpg?w=500" yields ".jpg", not ".jpg?w=500".
    ext = path.extname(new URL(url).pathname);
  } catch (err) {
    console.error(`skip invalid url: ${url} (${err.message})`);
    return undefined;
  }
  const file = path.join(dir, `${Date.now()}${ext}`);

  return new Promise((resolve) => {
    const fail = (err) => {
      console.error(`download failed: ${url} (${err.message})`);
      resolve(undefined);
    };

    mod
      .get(url, (res) => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
          // Drain the body so the socket is released, and do not store an
          // error page as if it were an image.
          res.resume();
          fail(new Error(`HTTP ${res.statusCode}`));
          return;
        }
        res.on('error', fail);
        res
          .pipe(fs.createWriteStream(file))
          .on('error', fail)
          .on('finish', () => {
            console.log(file);
            resolve(file);
          });
      })
      .on('error', fail);
  });
};
// base64 => image
/**
 * Decodes a base64 `data:` URI and writes it into `dir` as
 * `<timestamp>.<ext>`. The extension is the MIME subtype with `jpeg` mapped
 * to `jpg`; MIME parameters (`;charset=...`) and structured suffixes
 * (`+xml`) are dropped.
 *
 * Saving is best-effort: malformed input logs '非法 base64', a failed write
 * logs the underlying error, and both yield `undefined` instead of rejecting.
 *
 * @param {string} base64Str - Data URI such as `data:image/png;base64,....`
 * @param {string} dir - Existing directory to write into.
 * @returns {Promise<string|undefined>} Path of the written file, or
 *   `undefined` when nothing was saved.
 */
const base64ToImg = async (base64Str, dir) => {
  const matches =
    typeof base64Str === 'string'
      ? base64Str.match(/^data:(.+?);base64,(.+)$/)
      : null;
  // "image/jpeg;charset=utf-8" -> "image/jpeg" -> "jpeg"
  const subtype = matches ? matches[1].split(';')[0].split('/')[1] : undefined;
  if (!subtype) {
    console.log('非法 base64');
    return undefined;
  }

  const ext = subtype.replace('jpeg', 'jpg').replace(/\+.*$/, '');
  const file = path.join(dir, `${Date.now()}.${ext}`);
  try {
    await writeFile(file, matches[2], 'base64');
  } catch (err) {
    // A failed write is a file-system problem, not an invalid data URI:
    // report what actually went wrong.
    console.error(`write failed: ${file} (${err.message})`);
    return undefined;
  }
  console.log(file);
  return file;
};
GITHUB
https://github.com/GoogleChrome/puppeteer
中文翻译:
https://zhaoqize.github.io/puppeteer-api-zh_CN/#/
转载请注明:隨習筆記 » NodeJs笔记:puppeteer爬虫