爬虫： cheerio爬取网页中的所有图片

climbThePage.js

// (Downloads the images found in a web page.)
// Used to send HTTP(S) requests.
const https = require('https')
const http = require('http')
// Used to extract the <img> tags from the fetched page.
const cheerio = require('cheerio')
// Used to write HTTP response data to files.
const fs = require('fs')
// Used to obtain the platform's path separator.
const path = require('path')
const sep = path.sep
// Directories that receive the downloaded images and the saved page HTML.
// Both keep a trailing separator because file names are appended to them directly.
const imgDir = `${__dirname}${sep}imgs${sep}`
const pageDir = `${__dirname}${sep}pages${sep}`
// Value of `URL#protocol` for an HTTPS URL.
const HTTPS = 'https:'
// Create the output directories if they are missing. `recursive: true` makes
// the call a no-op for a directory that already exists, so no separate
// existence check is needed.
for (const dir of [imgDir, pageDir]) {
  fs.mkdirSync(dir, { recursive: true })
}
// 0. Crawl the page's HTML, read each <img> tag's src, and download the images.
function crteatedImg (callback) {// const url = 'http://gee2dan.com/'const url = 'https://www.chsi.com.cn/xlcx/lscx/queryinfo.do'// 下载中的图片数量let downloadingCount = 0let fileNameStr = ''downloadImgsOn(url)// 下载指定网站包含的图片function downloadImgsOn (url) {// URL作为optionsconst options = new URL(url);// 获取协议const protocol = options.protocol// 根据协议选择发送请求的模块const _http = protocol === HTTPS ? https : http// 发送请求const req = _http.request(options, (res) => {// 用于存储返回的html数据let htmlData = ''res.on('data', (chunk) => {htmlData += chunk.toString('utf8')})res.on('end', () => {// 将html数据存储到文件中,可用于人工校验const htmlFileName = `${pageDir}result.html`fs.writeFile(htmlFileName, htmlData, () => {// console.log('页面(%s)读取完毕,已保存至(%s)', url, htmlFileName)})// 将html信息转换为类jq对象const $ = cheerio.load(htmlData)const imgs = $('img')// 用于保存需要下载的图片url,去除重复的图片urlconst imgUrlSet = new Set()imgs.each((index, img) => {// 获取图片urllet imgUrl = img.attribs.src// 将不完整的图片url转完成完整的图片urlif (imgUrl.startsWith('//')) {imgUrl = protocol + imgUrl} else if (imgUrl.startsWith('/')) {imgUrl = url + imgUrl}imgUrlSet.add(imgUrl)})// console.log('获取图片url共%s个', imgUrlSet.size)// 下载imgUrlSet中包含的图片sfor (const imgUrl of imgUrlSet) {downloadImg(imgUrl)}})})req.on('error', (err) => {console.error(err)})req.end();}/*** 打印当前正在下载的图片数*/function printDownloadingCount () {// console.log('当前下载中的图片有%s个', downloadingCount)}/*** 下载指定url对应的图片* @param {*} imgUrl 目标图片url* @param {*} maxRetry 下载失败重试次数* @param {*} timeout 超时时间毫秒数*/function downloadImg (imgUrl, maxRetry = 10, timeout = 10000) {/*** 用于下载失败后重试*/function retry () {if (maxRetry) {// console.log('(%s)剩余重试次数:%s,即将重试', imgUrl, maxRetry);downloadImg(imgUrl, maxRetry - 1);} else {// console.log('(%s)下载彻底失败', imgUrl)}}// URL作为optionsconst options = new URL('https://www.chsi.com.cn/xlcx/lscx/' + imgUrl);// 根据协议选择发送请求的模块const _http = options.protocol === HTTPS ? 
https : http// 从url中提取文件名const matches = imgUrl.match(/(?<=.*\/)[^\/\?]+(?=\?|$)/)const fileName = matches && matches[0]// 请求关闭时是否需要重新请求let retryFlag = falseconst req = _http.request(options, (res) => {// console.log('开始下载图片(%s)', 'https://www.chsi.com.cn/xlcx/lscx/' + imgUrl)downloadingCount += 1printDownloadingCount()// 判断数据是否为图片类型,仅保存图片类型const contentType = res.headers['content-type']if (contentType.startsWith('image')) {// 存储图片数据到内存中const chunks = []res.on('data', (chunk) => {chunks.push(chunk)})// req.on('abort') 中相同的操作也可以写在 res.on('aborted') 中// res.on('aborted', () => {})res.on('end', () => {downloadingCount -= 1printDownloadingCount()// 若响应正常结束,将内存中的数据写入到文件中if (res.complete) {// console.log('图片(%s)下载完成', imgUrl)// 生成图片fileNameStr = Date.now() + '.jpg'write(imgDir + fileNameStr, chunks, 0)} else {// console.log('(%s)下载结束但未完成', imgUrl)}})}})req.on('error', (err) => {console.error(err)retryFlag = true})req.on('abort', () => {// console.log('下载(%s)被中断', imgUrl)retryFlag = true})req.on('close', () => {if (retryFlag) {retry()}})// 如果超时则中止当前请求req.setTimeout(timeout, () => {// console.log('下载(%s)超时', imgUrl)req.abort()})req.end()}/*** 将数据块数组chunks中第index个数据块写入到distFileName对应文件的末尾* @param {*} distFileName 数据将写入的文件名* @param {*} chunks 图片数据块数组* @param {*} index 写入数据块的索引*/function write (distFileName, chunks, index) {// // console.log('名称', distFileName)if (index === 0) {var i = 0// 判断文件是否重名,若重名则重新生成带序号的文件名let tmpFileName = distFileNamewhile (fs.existsSync(tmpFileName)) {tmpFileName = distFileName.replace(new RegExp(`^(.*?)([^${sep}\\.]+)(\\..*|$)`), `$1$2_${i}$3`)i += 1}distFileName = tmpFileName}// 获取图片数据块依次写入文件const chunk = chunks[index]if (chunk) {// 异步、递归fs.appendFile(distFileName, chunk, () => {write(distFileName, chunks, index + 1)})} else {// console.log('文件(%s)写入完毕', distFileName)// 验证码图片才会真正保存下来callback && callback(distFileName)console.log('名称', distFileName)}}}module.exports = {crteatedImg
}

调用

const { crteatedImg } = require('./climbThePage.js')
// 0. Crawl the page's HTML, read each <img> tag's src, and download the images.
// The callback receives the path of every image file that was saved.
crteatedImg((fileNameStr) => {
  console.log('文件名： ', fileNameStr)
})

爬虫： cheerio爬取网页中的所有图片相关推荐

【Java爬虫】爬取网页中的内容，提取其中文字
挺乱的,临时存一下 package cn.hanquan.craw;import java.io.FileWriter; import java.io.IOException; import java ...
r语言html爬虫,如何用R语言爬取网页中的表格
今天我们要讲怎么样用R写一个小的爬虫,来爬取网页中的表格.这里的网页指html页面.稍微百度一下大家就可以知道,html是一种高度结构化的文本标记语言.html表格所用的标签是 <table>. 所以我们的思路大概 ...
python爬取图片-Python爬取网页中的图片（搜狗图片）详解
前言最近几天,研究了一下一直很好奇的爬虫算法.这里写一下最近几天的点点心得.下面进入正文: 你可能需要的工作环境: Python 3.6官网下载本地下载我们这里以sogou作为爬取的对象. 首先 ...
python爬虫之爬取网页基础知识及环境配置概括
记:python爬虫是爬取网页数据.统计数据必备的知识体系,当我们想统计某个网页的部分数据时,就需要python爬虫进行网络数据的爬取,英文翻译为 spider 爬虫的核心 1.爬取网页:爬取整个网页 ...
python爬虫实现爬取网页主页信息（html代码）
python爬虫实现爬取网页主页信息(html代码) 1.爬取网站源码 urllib整体介绍: urllib是一个包,收集几个模块来处理网址 urllib.request打开和浏览url中内容 url ...
node.js 爬虫实现爬取网页图片并保存到本地
node.js 爬虫实现爬取网页图片并保存到本地没有废话直接看代码 /*** 请求网站数据* 将数据保存本地文件*/ //不同协议引用不同模块,http https const http = re ...
Python3爬取网页中图片（2021-01-04 14:06:02），附上完整代码
Python爬取网页中图片,附上完整代码文章目录 Python爬取网页中图片,附上完整代码概述完整代码概述批量爬取数据,请遵循robots协议及相关网站协议及说明. 本代码仅供有需要爬取网页 ...
利用python爬虫大量爬取网页图片
最近要进行一类图片的识别,因此需要大量图片,所以我用了python爬虫实现一.爬取某一图片网站主要参考:https://www.cnblogs.com/franklv/p/6829387.html ...
java爬虫-简单爬取网页图片
刚刚接触到"爬虫"这个词的时候是在大一,那时候什么都不明白,但知道了百度.谷歌他们的搜索引擎就是个爬虫. 现在大二.再次燃起对爬虫的热爱,查阅资料,知道常用java.python语 ...

爬虫： cheerio爬取网页中的所有图片

爬虫： cheerio爬取网页中的所有图片相关推荐

最新文章

热门文章