Selenium爬取前程无忧51job招聘信息

import csv
import random
import time
from lxml import etree
from selenium import webdriver
#实现规避检测
from selenium.webdriver import ChromeOptions
#实现无可视化界面的
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By#去除浏览器识别,去除Chrome正在受到自动检测软件的控制
# Browser setup: one Chrome options object carries both the anti-detection
# switches and the headless settings.
#
# The original built two separate option objects and passed them as
# ``chrome_options=`` and ``options=``.  Selenium 3 and 4.0-4.9 let the
# deprecated ``chrome_options`` keyword override ``options`` (so the headless,
# GPU and user-agent settings were dropped), and Selenium 4.10+ removed the
# keyword entirely, making the call raise TypeError.
chrome_options = ChromeOptions()

# Remove the "Chrome is being controlled by automated test software" banner
# and the automation switches a site can use to recognise the browser.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('detach', True)
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

chrome_options.add_argument('--headless')  # run without a visible window
chrome_options.add_argument('--disable-gpu')  # do not use the GPU
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')

# Keep the old ``options`` name bound for any code that still refers to it.
options = chrome_options

# Instantiate the browser object with the merged options.
web = webdriver.Chrome(options=chrome_options)

# Read the stealth script; an explicit encoding avoids depending on the
# platform's locale default when decoding the file.
with open('./stealth.min.js', encoding='utf-8') as f:
    js = f.read()
# Register the stealth script so the browser evaluates it in every new
# document it loads.
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})


def modify_detail(job_details):
    """Split a posting's detail text into responsibilities and requirements.

    The first element of ``job_details`` is dropped, the remaining strings are
    stripped and blank ones discarded.  The cleaned lines are then split at a
    line containing one of the known "requirements" headings: everything
    before that line is returned as the responsibilities, everything after it
    as the requirements, and the heading line itself is left out.  If several
    lines contain a heading, the last one decides the split.

    When no heading is found, all cleaned lines are returned as the
    responsibilities and the requirements list is empty, so the description
    is not lost (the original returned two empty lists in that case).

    Args:
        job_details: list of text fragments taken from the detail page.

    Returns:
        A ``(responsibilities, requirements)`` tuple of lists of strings.
    """
    # Headings that introduce the requirements part of a posting.
    markers = (
        '任职资格',
        '职位要求',
        'Note',
        '任职要求',
        '岗位任职条件',
        '岗位要求',
        '招聘条件',
        '应聘要求',
    )

    lines = [x.strip() for x in job_details[1:] if x.strip() != '']

    new1_list = lines  # fallback when no heading is present
    new2_list = []
    for k, line in enumerate(lines):
        if any(marker in line for marker in markers):
            new1_list = lines[:k]
            new2_list = lines[k + 1:]

    print(new1_list)
    print(new2_list)
    return new1_list, new2_list


# Extract the data with XPath and CSS selectors.
# Column order of the output CSV; also the order used for the progress print.
_CSV_FIELDS = [
    '岗位名称', '薪资', '公司名称', '公司规模', '所属行业', '工作地点', '工作经验',
    '学历要求', '关键词', '发布日期', '招聘详情', '岗位职责', '任职资格',
]

# Locators used on the 51job pages.
_SEARCH_BOX_XPATH = '//*[@id="kwdselectid"]'
_SEARCH_BUTTON_XPATH = '/html/body/div[3]/div/div[1]/div/button'
_JUMP_BOX_XPATH = '//*[@id="jump_page"]'
_JUMP_BUTTON_XPATH = '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[3]/div/div/span[3]'
_JOB_CARD_XPATH = '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div'
_DETAIL_TEXT_XPATH = '/html/body/div[2]/div[2]/div[3]/div[1]/div/text()'


def _pause(low, high):
    """Sleep for a random number of tenths of a second between low and high."""
    time.sleep(random.randint(low, high) * 0.1)


def _split_summary(summary):
    """Split the 'location|experience|education' text of a result card.

    Any shape other than exactly three '|'-separated parts keeps only the
    first part as the location and falls back to the default experience and
    education labels.
    """
    parts = summary.split('|')
    if len(parts) == 3:
        return parts[0], parts[1], parts[2]
    return parts[0], '无需经验', '学历不限'


def _read_listing(job):
    """Read the fields shown on one search-result card into a row dict."""
    row = {}
    row['岗位名称'] = job.find_element(By.CLASS_NAME, 'jname.at').text  # job title
    _pause(5, 15)
    row['薪资'] = job.find_element(By.CLASS_NAME, 'sal').text  # salary
    _pause(5, 15)
    row['公司名称'] = job.find_element(By.CLASS_NAME, 'cname.at').text  # company name
    _pause(5, 15)
    row['公司规模'] = job.find_element(By.CLASS_NAME, 'dc.at').text  # company size
    _pause(5, 15)
    row['所属行业'] = job.find_element(By.CLASS_NAME, 'int.at').text  # industry
    _pause(5, 15)

    # Location, experience and education share one '|'-separated element.
    summary = job.find_element(By.CLASS_NAME, 'd.at').text
    print(summary)
    row['工作地点'], row['工作经验'], row['学历要求'] = _split_summary(summary)
    _pause(5, 15)

    # Keywords are optional on a card; use a placeholder when the lookup fails.
    try:
        row['关键词'] = job.find_element(By.CLASS_NAME, 'tags').get_attribute('title')
    except Exception:
        row['关键词'] = '无数据'
    _pause(5, 15)

    row['发布日期'] = job.find_element(By.CLASS_NAME, 'time').text  # publish date
    _pause(5, 15)
    row['招聘详情'] = job.find_element(By.CLASS_NAME, 'el').get_attribute('href')  # detail link
    return row


def _read_detail(main_handle):
    """Scrape and close every window other than main_handle, then switch back.

    Returns the (responsibilities, requirements) pair produced by
    modify_detail, or two empty lists when no other window is open.
    """
    job_res, job_zige = [], []
    for handle in web.window_handles:
        if handle == main_handle:
            continue
        web.switch_to.window(handle)
        html = etree.HTML(web.page_source)
        job_details = html.xpath(_DETAIL_TEXT_XPATH)
        job_res, job_zige = modify_detail(job_details)
        web.close()
    web.switch_to.window(main_handle)
    return job_res, job_zige


job_names = ['金融科技', '区块链金融', '量化投资', '金融大数据分析', '金融风险分析', '金融产品用户运营', '金融用户体验']

# The context manager closes (and flushes) the CSV file even when the crawl
# stops with an exception; the original never closed it.
with open('金融招聘.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_write = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
    # The file is opened in append mode, so only write the header when the
    # file is still empty; otherwise every run adds another header row.
    if f.tell() == 0:
        csv_write.writeheader()

    for j in job_names:
        # Open the home page and search for the current keyword.
        web.get("https://www.51job.com/")
        time.sleep(5)  # give the page time to load
        web.find_element(By.XPATH, _SEARCH_BOX_XPATH).click()
        web.find_element(By.XPATH, _SEARCH_BOX_XPATH).clear()
        web.find_element(By.XPATH, _SEARCH_BOX_XPATH).send_keys(j)
        web.find_element(By.XPATH, _SEARCH_BUTTON_XPATH).click()  # run the search
        time.sleep(10)

        for page in range(1, 50):
            print(f'==============正在爬取{page}页信息==================')
            time.sleep(10)

            # Jump to the wanted result page through the page-number box.
            web.find_element(By.XPATH, _JUMP_BOX_XPATH).click()
            _pause(10, 30)
            web.find_element(By.XPATH, _JUMP_BOX_XPATH).clear()
            _pause(10, 40)
            web.find_element(By.XPATH, _JUMP_BOX_XPATH).send_keys(str(page))
            _pause(10, 30)
            web.find_element(By.XPATH, _JUMP_BUTTON_XPATH).click()
            time.sleep(5)

            # Every posting card on the current result page.
            jobData = web.find_elements(By.XPATH, _JOB_CARD_XPATH)
            for job in jobData:
                row = _read_listing(job)

                # Remember the list window, then click the card to open the
                # posting's detail page.
                handle = web.current_window_handle
                try:
                    job.click()
                except Exception:
                    continue  # detail page cannot be opened: skip this posting

                # Wait before reading the window handles so a slowly opening
                # detail tab is included and has time to load.
                time.sleep(5)
                row['岗位职责'], row['任职资格'] = _read_detail(handle)

                company = row['公司名称']
                print(f'正在爬取{company}公司')
                print(*(row[field] for field in _CSV_FIELDS))
                csv_write.writerow(row)

                time.sleep(10)  # throttle before moving on to the next posting

Selenium爬取前程无忧51job招聘信息相关推荐

python+selenium爬取智联招聘信息
python+selenium爬取智联招聘信息需求准备代码结果需求老板给了我一份公司名单(大概几百家如下图),让我到网上看看这些公司分别在招聘哪些岗位,通过分析他们的招聘需求大致能推断出我 ...
一键爬取前程无忧51job招聘网，从此毕设数据不用愁
此次爬取代码以关键词开发工程师为例,爬取无筛选范围内的岗位数据: 查看网页源码:通过对网站发送申请,获取响应数据,进行网页分析,确定数据所在位置.注意这里不能直接定位标签,数据在java-script ...
爬取前程无忧51job（动态数据）
爬取前程无忧51job网上全国"python"关键字所对应的岗位招聘信息利用Requests和正则表达式方法,爬取前程无忧51job网站上全国"爬虫"关键字对 ...
爬虫项目 | 爬取XX网站招聘信息
/***本人代码小白 ,第一次做爬虫,代码仅供参考,欢迎大神指点,***/ 项目背景和功能毕业将近,身为大三的我们,面临找工作的压力,如何快速的找到自己心仪的岗位并且及时投递简历成为同学们关心的问题 ...
利用Scrapy框架爬取前程无忧招聘信息
利用Scrapy框架爬取前程无忧招聘信息关于安装和命令使用可参考:https://docs.scrapy.org/en/1.7/intro/install.html 先创建项目和爬虫文件分析网站 ...
利用Selenium爬取淘宝商品信息
文章来源:公众号-智能化IT系统. 一. Selenium和PhantomJS介绍 Selenium是一个用于Web应用程序测试的工具,Selenium直接运行在浏览器中,就像真正的用户在操作一样. ...
(转)python爬虫实例——爬取智联招聘信息
受友人所托,写了一个爬取智联招聘信息的爬虫,与大家分享. 本文将介绍如何实现该爬虫. 目录网页分析实现代码分析结果总结 github代码地址网页分析以https://xiaoyuan.zh ...
python爬虫实例——爬取智联招聘信息
受友人所托,写了一个爬取智联招聘信息的爬虫,与大家分享. 本文将介绍如何实现该爬虫. 目录网页分析实现代码分析结果总结 github代码地址网页分析以https://xiaoyuan.zh ...
python爬虫——用selenium爬取淘宝商品信息
python爬虫--用selenium爬取淘宝商品信息 1.附上效果图 2.淘宝网址https://www.taobao.com/ 3.先写好头部 browser = webdriver.Chrome ...

Selenium爬取前程无忧51job招聘信息

Selenium爬取前程无忧51job招聘信息相关推荐

最新文章

热门文章