Scraping the Fortune 500 list

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# The ranking table is laid out as tbody > tr > td
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()   # rank
    names = td[1].get_text().strip()    # company name
    income = td[2].get_text().strip()   # revenue
    profit = td[3].get_text().strip()   # profit
    country = td[4].get_text().strip()  # country
    print('{0:<20}\t{1:<20}\t{2:<20}\t{3:<20}\t{4:<20}'.format(number, income, profit, country, names))

Result screenshot
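All of the scripts in this post repeat the same Request / urlopen / BeautifulSoup boilerplate, so a small helper like the one below can cut the duplication. This is only a sketch; the function name get_soup and its defaults are my own, not part of the original scripts.

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}

def get_soup(url, headers=DEFAULT_HEADERS):
    # Fetch a page with a browser-like User-Agent and return a parsed soup.
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    return BeautifulSoup(html, "html.parser")

# Usage:
# bs = get_soup("http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm")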

Scraping the Douban Top 250 movies

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

for i in range(0, 250, 25):  # 10 pages, 25 movies per page
    url = "https://movie.douban.com/top250?start={0}&filter=".format(i)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    names = bs.findAll('span', {'class': "title"})
    scores = bs.findAll('span', {'class': "rating_num"})
    numbers = bs.findAll('em', {'class': ""})
    name_list = []
    for name in names:
        name = name.get_text()
        # Each movie has a second "title" span with the original-language name,
        # whose text starts with a no-break space and '/'; keep only the main title.
        if name[1] != '/':
            name_list.append(name)
    for number, name, score in zip(numbers, name_list, scores):
        score = score.get_text()
        number = number.get_text()
        print(number, name, score)

Result screenshot

Scraping the Chinese university rankings

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# The ranking table is laid out as tbody > tr > td
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()    # rank
    college = td[1].get_text().strip()   # university name
    province = td[2].get_text().strip()  # province / city
    type = td[3].get_text().strip()      # category
    score = td[4].get_text().strip()     # total score
    print(number, college, province, type, score)

Result screenshot

Scraping the Chinese university rankings plus profile links

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()
    college = td[1].get_text().strip()
    province = td[2].get_text().strip()
    type = td[3].get_text().strip()
    score = td[4].get_text().strip()
    # The university cell also carries a relative link to its profile page
    a = j.find_all('a')
    wangzhi = a[0].get('href')
    link = 'http://www.shanghairanking.cn{0}'.format(wangzhi)
    print(number, college, province, type, score, link)

Result screenshot

Scraping Levitan landscape oil paintings

import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'https://www.sohu.com/a/286956359_301394'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# All of the painting images sit inside the <article class="article"> element
pictures = bs.find('article', {'class': "article"}).find_all('img')
i = 1
for picture in pictures:
    picture = picture.attrs['src']
    # Save each image as picture1.jpeg, picture2.jpeg, ... in a local folder
    dir = os.path.abspath('C:/Users/86186/picture')
    work_path = os.path.join(dir, 'picture{}.jpeg'.format(i))
    i = i + 1
    urlretrieve(picture, work_path)

Result screenshot

Scraping 51job (前程无忧) job listings

from urllib.request import Request, urlopen
import bs4
import json
import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('前程无忧招聘信息')
worksheet.write(0, 0, label='工作名称')
worksheet.write(0, 1, label='公司名称')
worksheet.write(0, 2, label='工作地区')
worksheet.write(0, 3, label='公司属性')
worksheet.write(0, 4, label='职位要求')
worksheet.write(0, 5, label='职责要求')

z = 1  # current row in the worksheet (row 0 is the header)
for x in range(1, 6):  # first five result pages of the "大数据" search
    url = ('https://search.51job.com/list/080200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,' + str(x)) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
    ret = Request(url=url, headers=header)
    html = urlopen(ret)
    bs = bs4.BeautifulSoup(html, 'html.parser')
    # The job list is embedded as JSON inside a <script> tag rather than in the HTML
    names = bs.find_all('script', type="text/javascript")
    for name in names:
        t = name.get_text()
        if len(t) > 0:
            t = t.replace('window.__SEARCH_RESULT__ = ', '')
            data = json.loads(t)
            y = data["engine_search_result"]
            for j in range(1, len(y)):
                hrefs = y[j]['job_href']
                worksheet.write(z, 0, y[j]['job_name'])
                worksheet.write(z, 1, y[j]['company_name'])
                worksheet.write(z, 2, y[j]['workarea_text'])
                worksheet.write(z, 3, y[j]['companytype_text'])
                worksheet.write(z, 4, y[j]['attribute_text'])
                # Follow each job's detail page to pull the full job description
                infos = Request(hrefs, headers=header)
                htmls = urlopen(infos)
                bs2 = bs4.BeautifulSoup(htmls, 'html.parser')
                try:
                    texts = bs2.find('div', {"class": 'bmsg job_msg inbox'}).get_text().split()
                    job_requests = "".join(texts)
                    worksheet.write(z, 5, label=job_requests)
                except:
                    worksheet.write(z, 5, label=' ')
                z += 1
workbook.save('前程无忧.xls')

Result screenshot

Scraping China Fortune news flashes

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

f = open('财富.txt', 'w', encoding='UTF-8')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
for i in range(2, 10):
    # Pages 2-9 of the news-flash list
    url = "http://www.cfbond.com/in/cfkxlb/index_{0}.shtml".format(i)
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    # Follow every link on the list page and append the article body text to the file
    for link in bs.findAll('a'):
        link = link.get('href')
        website = Request(link, headers=headers)
        web = urlopen(website)
        bs2 = BeautifulSoup(web, "html.parser")
        texts = bs2.find_all('div', {'class': 's_xlLContCRC'})
        for text in texts:
            text = text.get_text().replace(' ', '').replace('\r', '')
            f.write(text)
f.close()

Result screenshot

Scraping papers from the Chinese Journal of Computers (计算机学报)

import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'http://cjc.ict.ac.cn/qwjs/No2020-01.htm'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
names = []
# Paper titles are the spans styled with color:#006688
divs = bs.find_all('span', {'style': 'color:#006688'})
# The anchors on the page point at the papers' PDF files
links = bs.find_all('a')
i = 0
for div in divs:
    # The page is GBK-encoded but is decoded as ISO-8859-1, so re-encode and
    # decode to recover the Chinese titles
    div = div.get_text().encode('iso-8859-1').decode('gbk')
    names.append(div)
for link in links:
    pdf = link.get('href')
    dir = os.path.abspath('C:/Users/86186/essary')
    work_path = os.path.join(dir, '{}.pdf'.format(names[i]))
    urlretrieve(pdf, work_path)
    i = i + 1

Result screenshot
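One caveat with the loop above: bs.find_all('a') returns every anchor on the page, not only the PDF links, so the pairing with names[i] can drift. A small guard like the following is my own addition, not part of the original script; it assumes the PDF hrefs are absolute URLs and appear in the same order as the titles.

# Keep only anchors whose href ends in .pdf before downloading (illustrative fix).
pdf_links = [a.get('href') for a in bs.find_all('a')
             if a.get('href') and a.get('href').lower().endswith('.pdf')]
for name, pdf in zip(names, pdf_links):
    work_path = os.path.join(os.path.abspath('C:/Users/86186/essary'), '{}.pdf'.format(name))
    urlretrieve(pdf, work_path)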

Scraping the Douban Music Top 250

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlwt
# import time

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
worksheet.write(0, 0, "排名")
worksheet.write(0, 1, "歌名")
worksheet.write(0, 2, "歌手")
worksheet.write(0, 3, "发表时间")
worksheet.write(0, 4, "音乐类型")
worksheet.write(0, 5, "评分")
worksheet.write(0, 6, "详细链接")

j = 1  # current row in the worksheet (row 0 is the header)
for i in range(10):  # 10 pages x 25 albums = Top 250
    url = 'https://music.douban.com/top250?start={}'.format(i * 25)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret).read()
    bs = BeautifulSoup(html, "html.parser")
    div = bs.find("div", {"class": "indent"})
    divs = div.find_all("div", {"class": "pl2"})
    for div_pl2 in divs:
        title = div_pl2.select('a')[0].text.replace(' ', '')
        title = title.replace('\n', ' ').replace('\r', '')
        # The <p class="pl"> line looks like "singer / release date / ... / genre"
        content = div_pl2.find("p", {"class": "pl"}).get_text().split('/')
        singer = content[0]
        music_time = content[1]
        music_type = content[-1]
        score = div_pl2.find("span", {"class": "rating_nums"}).get_text()
        link = div_pl2.find('a').get('href')
        worksheet.write(j, 0, j)
        worksheet.write(j, 1, title)
        worksheet.write(j, 2, singer)
        worksheet.write(j, 3, music_time)
        worksheet.write(j, 4, music_type)
        worksheet.write(j, 5, score)
        worksheet.write(j, 6, link)
        # time.sleep(1)
        j += 1
workbook.save('豆瓣音乐Top250.xls')

Result screenshot

Scraping novels

(incomplete version)

import os
from urllib.request import Request, urlopen
import bs4

url = 'https://www.xstt5.com/'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# Each <ul class="cl"> block on the home page lists links to novel pages
u = bs.find_all('ul', {'class': "cl"})
for i in u:
    links = i.find_all('a')
    for s in links:
        link = s.attrs['href']
        infos = Request(link, headers=header)
        htmls = urlopen(infos)
        bs1 = bs4.BeautifulSoup(htmls, 'html.parser')
        # <div class="ex"> holds the novel's description text
        div = bs1.find_all('div', {'class': "ex"})
        for j in div:
            print(j)
# TODO: save each novel's text to a local file (still unfinished)

Result screenshot
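To finish what the unfinished draft above was attempting, the scraped text could be written to local .txt files instead of being passed to urlretrieve (which only downloads a URL's raw bytes). The following is only a sketch: it reuses the div result from the loop above, and the output folder name is made up; real chapter pages would need their own selectors.

import os

# Hypothetical output folder -- adjust as needed.
out_dir = os.path.abspath('C:/Users/86186/novels')
os.makedirs(out_dir, exist_ok=True)

# Assuming `div` still holds the <div class="ex"> blocks from the loop above,
# write each block's text to its own numbered .txt file.
for k, d in enumerate(div, start=1):
    text = d.get_text().strip()
    work_path = os.path.join(out_dir, 'novel{}.txt'.format(k))
    with open(work_path, 'w', encoding='utf-8') as f:
        f.write(text)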

Still to learn:
Basic regular expressions
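Although re is imported in several of the scripts above, it is never actually used. A minimal example of what basic regular expressions can do (the pattern and sample string are my own, purely illustrative):

import re

html = '<td>1</td><td>Walmart</td><td>523,964</td>'   # illustrative sample row
cells = re.findall(r'<td>(.*?)</td>', html)            # non-greedy capture of each cell's text
print(cells)                                           # ['1', 'Walmart', '523,964']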
