创建爬虫

scrapy startproject boos

cd boos

scrapy genspider -t crawl zhiping "zhipin.com"

爬虫代码

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from pa_chong.Scrapy.boos.boos.items import BoosItem


class ZhipingSpider(CrawlSpider):
    """Crawl zhipin.com search results for "python" and scrape job-detail pages.

    Listing pages are followed for pagination only; every job-detail link is
    passed to :meth:`parse_job`, which builds one ``BoosItem`` per page.
    """

    name = 'zhiping'
    allowed_domains = ['zhipin.com']
    start_urls = ['https://www.zhipin.com/c100010000/?query=python&page=1']

    rules = (
        # Job-listing pages: follow the pagination links, no callback.
        Rule(LinkExtractor(allow=r'.+\?query=python&page=\d'), follow=True),
        # Job-detail pages: parse with parse_job and do not follow further.
        Rule(LinkExtractor(allow=r'.+job_detail/.+~\.html'), callback='parse_job', follow=False),
    )

    @staticmethod
    def _first_text(response, query):
        """Return the stripped first match of *query*, or '' when nothing matches."""
        return (response.xpath(query).get() or '').strip()

    def parse_job(self, response):
        """Extract one job posting from a job-detail page.

        Args:
            response: the downloaded job-detail page.

        Returns:
            A ``BoosItem`` with title, salary, city, work_years, education and
            company, or ``None`` when the page has no job title (for example
            when an anti-bot page was served instead of the posting).
        """
        title = self._first_text(response, '//div[@class="name"]/h1/text()')
        if not title:
            # Without a title this is not a real posting; skip it instead of
            # crashing on a missing node.
            self.logger.warning('No job title found on %s; skipping', response.url)
            return None

        salary = self._first_text(response, '//span[@class="badge"]/text()')

        # The predicate previously opened with '{' instead of '[', which made
        # the whole XPath expression invalid.
        job_info = response.xpath(
            '//div[@class="job-primary detail-box"]/div[@class="info-primary"]/p//text()'
        ).getall()
        # The first three text nodes are read as city, experience, education;
        # pad with '' so a shorter list does not raise IndexError.
        city, work_years, education = (job_info + [''] * 3)[:3]

        company = self._first_text(
            response, '//div[@class="info-company"]/h3[@class="name"]/a/text()'
        )

        return BoosItem(
            title=title,
            salary=salary,
            city=city,
            work_years=work_years,
            education=education,
            company=company,
        )

修改settings.py代码

items.py代码

# -*- coding: utf-8 -*-# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BoosItem(scrapy.Item):
    """Data model for one scraped job posting."""

    # Fields filled in by the spider's detail-page callback.
    title = scrapy.Field()
    salary = scrapy.Field()
    city = scrapy.Field()
    work_years = scrapy.Field()
    education = scrapy.Field()
    company = scrapy.Field()

pipelines.py代码

# -*- coding: utf-8 -*-# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter


class BoosPipeline:
    """Item pipeline that exports every scraped item to ``jobs.json``."""

    def __init__(self):
        # The file is opened in binary mode and handed to the exporter;
        # ensure_ascii=False keeps non-ASCII text unescaped in the output.
        out_file = open('jobs.json', 'wb')
        self.f = out_file
        self.expoter = JsonLinesItemExporter(out_file, ensure_ascii=False)

    def process_item(self, item, spider):
        """Export *item* and return it unchanged for any later pipeline."""
        exporter = self.expoter
        exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.f.close()

middlewares.py代码

把Scrapy自动生成的中间件删除，改为自己定义：

# -*- coding: utf-8 -*-# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random


# Random User-Agent
class UserAgentDownloaderMiddleware(object):
    """Downloader middleware that sets a random User-Agent on every request."""

    # Pool of browser User-Agent strings; one is picked per request.
    USER_AGENTS = [
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
        'Mozilla/4.0 (compatible; MSIE 7.0; America Online Browser 1.1; Windows NT 5.1; (R1 1.5); .NET CLR 2.0.50727; InfoPath.1)',
        'Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)',
        'Mozilla/5.0 (Windows; U; ; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.8.0',
        'Mozilla/5.0 (X11; U; UNICOS lcLinux; en-US) Gecko/20140730 (KHTML, like Gecko, Safari/419.3) Arora/0.8.0',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; Avant Browser; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
        'Mozilla/5.0 (Windows; U; WinNT; en; rv:1.0.2) Gecko/20030311 Beonex/0.8.2-stable',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1b2) Gecko/20060826 BonEcho/2.0b2',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Browzar)',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; XH; rv:8.578.498) fr, Gecko/20121021 Camino/8.723+ (Firefox compatible)',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418.8 (KHTML, like Gecko, Safari) Cheshire/1.0.UNOFFICIAL',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.3) Gecko/20100409 Firefox/3.6.3 CometBird/3.6.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.7 (KHTML, like Gecko) Comodo_Dragon/16.1.1.0 Chrome/16.0.912.63 Safari/535.7',
        # This entry was split across two physical lines, leaving an
        # unterminated string literal; it is a single value.
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; SV1; Crazy Browser 9.0.04)',
        'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
    ]

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random pool entry.

        Returns ``None`` implicitly, so processing of the request continues.
        """
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent


# Proxy IP setup
import requests
import json
from pa_chong.Scrapy.boos.boos.models import ProxyModel   # 引入解析代理ip和过期时间的模型：ProxyModel
from twisted.internet.defer import DeferredLock  # 导入设置线程锁的模块# 使用芝麻代理来换IP，这里需要3个方法：
# process_request(self, request, spider)： 在请求发送之前执行
# process_response(self, request, response, spider)： 在数据下载到引擎之前执行
# updata_proxy(self):  更新代理ip的方法
class IPProxyDownloadMiddleware(object):
    """Downloader middleware that routes requests through a rotating proxy IP.

    A single proxy (a ``ProxyModel``) is shared by all requests. It is replaced
    when none has been fetched yet, when it is about to expire, or when a
    response indicates that it has been banned.
    """

    PROXY_URL = '芝麻代理请求获取ip的url'

    def __init__(self):
        super(IPProxyDownloadMiddleware, self).__init__()
        self.current_proxy = None  # ProxyModel currently in use, or None
        self.lock = DeferredLock()

    def process_request(self, request, spider):
        """Attach the current proxy to *request* before it is sent.

        The proxy is refreshed first when the request has none yet, when no
        proxy has been fetched, or when the current one is about to expire.
        If no proxy could be obtained, the request is left unchanged.
        """
        proxy = self.current_proxy
        if 'proxy' not in request.meta or proxy is None or proxy.is_expiring:
            self.updata_proxy()

        if self.current_proxy is None:
            # The fetch failed; avoid dereferencing None.
            print('No proxy available; request left unchanged: %s' % request.url)
            return
        # Always (re)assign, so retried requests pick up a rotated proxy.
        request.meta['proxy'] = self.current_proxy.proxy

    def process_response(self, request, response, spider):
        """Detect a banned proxy and re-queue the request if so.

        A non-200 status, or a URL containing "captcha", is treated as the
        proxy having been banned.

        Returns:
            *response* when it looks healthy, otherwise *request* so that it
            is scheduled again.
        """
        if response.status == 200 and 'captcha' not in response.url:
            return response

        banned = self.current_proxy
        # `blacked` acts as a switch: the first failing response flips it and
        # triggers the refresh; later failures from the same proxy see it set
        # and skip the refresh. A fresh ProxyModel starts with it False.
        if banned is None or not banned.blacked:
            if banned is not None:
                banned.blacked = True
                # Report the banned IP before it is replaced.
                print('ip：%s 已经被加入黑名单' % banned.ip)
            self.updata_proxy()

        # The request was already seen by the scheduler's duplicate filter;
        # without dont_filter the re-queued copy would be dropped.
        request.dont_filter = True
        return request

    def updata_proxy(self):
        """Fetch a new proxy from ``PROXY_URL`` if the current one is unusable.

        Does nothing when the current proxy exists, is not about to expire and
        is not banned. On a network error, an unparsable body or an empty
        ``data`` list, the current proxy is left unchanged.
        """
        # NOTE(review): DeferredLock.acquire() returns a Deferred rather than
        # blocking, so this does not serialise OS threads - confirm whether a
        # lock is needed at all here.
        self.lock.acquire()
        try:
            proxy = self.current_proxy
            if proxy is not None and not proxy.is_expiring and not proxy.blacked:
                return

            try:
                # NOTE(review): this call blocks until it returns; the timeout
                # bounds how long that can be.
                response = requests.get(self.PROXY_URL, timeout=10)
                text = response.text
                print('重新获取代理ip', text)
                result = json.loads(text)
            except (requests.RequestException, ValueError) as exc:
                print('Failed to fetch a new proxy: %s' % exc)
                return

            entries = result.get('data') if isinstance(result, dict) else None
            if entries:
                # The first entry carries the ip, port and expiry time.
                self.current_proxy = ProxyModel(entries[0])
        finally:
            # Always release, even if the fetch raised.
            self.lock.release()


# The refresh is guarded so that concurrent callers do not each request a new
# proxy and waste IPs.

单独创建一个文件(模型)，用于解析代理ip和判断过期时间
命名为models.py，定义一个ProxyModel类

from datetime import datetime, timedelta


class ProxyModel(object):
    """One proxy IP returned by the proxy provider, with its expiry time.

    Args:
        data: mapping with the keys ``'ip'``, ``'port'`` and ``'expire_time'``;
            ``'expire_time'`` is a ``'YYYY-MM-DD HH:MM:SS'`` string.

    Raises:
        KeyError: if one of the keys is missing.
        ValueError: if ``'expire_time'`` does not match the expected format.
    """

    # Format of the expiry timestamp in the provider's response.
    EXPIRE_FORMAT = '%Y-%m-%d %H:%M:%S'
    # A proxy with less than this much lifetime left counts as expiring.
    EXPIRY_MARGIN = timedelta(seconds=5)

    def __init__(self, data):
        self.ip = data['ip']
        self.port = data['port']
        self.expire_str = data['expire_time']
        # Proxy URL in the form scheme://ip:port.
        self.proxy = 'https://{}:{}'.format(self.ip, self.port)
        # Switch set by the middleware once this proxy is seen to be banned.
        self.blacked = False
        self.expire_time = datetime.strptime(self.expire_str, self.EXPIRE_FORMAT)

    @property
    def is_expiring(self):
        """True when the proxy expires within ``EXPIRY_MARGIN`` or already has."""
        # NOTE(review): compares against naive local time - confirm the
        # provider reports expiry in the same time zone.
        return (self.expire_time - datetime.now()) < self.EXPIRY_MARGIN

Scrapy：boos直聘爬虫案例相关推荐

Python爬虫实战，pytesseract模块，Python实现BOOS直聘拉勾网岗位数据可视化
前言利用Python实现BOOS直聘&拉勾网岗位数据可视化.废话不多说. 让我们愉快地开始吧~ 开发工具 Python版本: 3.6.4 相关模块: requests模块 pyspider模 ...
爬虫实战-直聘-爬虫岗位分析
爬虫岗位数据分析一数据集 # boss直聘网址:https://www.zhipin.com/web/geek/job?query=%E7%88%AC%E8%99%AB&city=1000 ...
2020最新BOOS直聘爬取保姆式教程，你值得拥有！
前言来到BOOS直聘搜索python 打开控制台,查看请求发现,页面数据不是动态加载所以直接复制当前页面链接进行爬取,经过多次的爬取之后 ....... 失策失策,以前爬取别的网站从没有这么严格 ...
Boos直聘行业数据获取、json解析
开发一个招聘类的项目,网站需要行业数据目标:将行业数据填充进数据库过程分为两步:1:获取数据 2:解析数据放入数据库 1.获取数据进入boos直聘网站首页红框框住的就是我们要获取的行业数据了 ...
Python使用BeautifulSoup与selenium爬取Boos直聘
Hello,我是普通Gopher,00后男孩,极致的共享主义者,想要成为一个终身学习者.专注于做最通俗易懂的计算机基础知识类公众号.每天推送Golang技术干货,内容起于K8S而不止于K8S,涉及Do ...
“鼓上蚤”带你实战之Boss直聘爬虫
1 . 前言 2 . 数据需求 3 . 分析页面 1前言目前来说,在互联网招聘界有一个"蓬勃生长"的"招聘小巨头",也就是我们这期被爬的第一"男猪脚 ...
使用PlayWright技术实现Boss直聘爬虫
笔者之前使用RPA工具-来也Uibot给公司人事开发了一款岗位薪酬爬虫程序,感觉这种工具还是是给不太懂开发的大众小白用的,不太灵活,也没有强大的技术栈做支撑,后来发现了谷歌的puppeteer技术,非 ...
boos直聘显示服务器繁忙,BOSS 直聘回应服务“崩了”：系统升级已完成，可正常使用...
5 月 25 日午间消息,部分用户反映 BOSS 直聘 App 无法正常使用,#boss 直聘崩了 #冲上热搜.新浪科技了解到,因 BOSS 直聘进行系统升级,导致部分用户无法正常使用 BOSS 直聘 ...
爬虫-反爬一：boss直聘cookies反爬怎么治
文章目录絮叨一下分析开撸 1．新建爬虫 2.分析页面布局 3.步骤 1.设置middlewares以及settings(核心) 2.boss_spider.py 3.item.py 4.运行 5 ...

Scrapy：boos直聘爬虫案例

创建爬虫

爬虫代码

修改settings.py代码

items.py代码

pipelines.py代码

middlewares.py代码

Scrapy：boos直聘爬虫案例相关推荐

最新文章

热门文章