Scraping the Fortune 500 list

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# The ranking table is laid out as tbody > tr > td
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()   # rank
    names = td[1].get_text().strip()    # company name
    income = td[2].get_text().strip()   # revenue
    profit = td[3].get_text().strip()   # profit
    country = td[4].get_text().strip()  # country
    print('{0:<20}\t{1:<20}\t{2:<20}\t{3:<20}\t{4:<20}'.format(number, income, profit, country, names))

Result screenshot
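All of the scripts in this post repeat the same Request / urlopen / BeautifulSoup boilerplate, so a small helper like the one below can cut the duplication. This is only a sketch; the function name get_soup and its defaults are my own, not part of the original scripts.

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}

def get_soup(url, headers=DEFAULT_HEADERS):
    # Fetch a page with a browser-like User-Agent and return a parsed soup.
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    return BeautifulSoup(html, "html.parser")

# Usage:
# bs = get_soup("http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm")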

Scraping the Douban Top 250 movies

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

for i in range(0, 250, 25):  # 10 pages, 25 movies per page
    url = "https://movie.douban.com/top250?start={0}&filter=".format(i)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    names = bs.findAll('span', {'class': "title"})
    scores = bs.findAll('span', {'class': "rating_num"})
    numbers = bs.findAll('em', {'class': ""})
    name_list = []
    for name in names:
        name = name.get_text()
        # Each movie has a second "title" span with the original-language name,
        # whose text starts with a no-break space and '/'; keep only the main title.
        if name[1] != '/':
            name_list.append(name)
    for number, name, score in zip(numbers, name_list, scores):
        score = score.get_text()
        number = number.get_text()
        print(number, name, score)

Result screenshot

Scraping the Chinese university rankings

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
# The ranking table is laid out as tbody > tr > td
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()    # rank
    college = td[1].get_text().strip()   # university name
    province = td[2].get_text().strip()  # province / city
    type = td[3].get_text().strip()      # category
    score = td[4].get_text().strip()     # total score
    print(number, college, province, type, score)

Result screenshot

Scraping the Chinese university rankings plus profile links

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
ret = Request(url, headers=headers)
html = urlopen(ret)
bs = BeautifulSoup(html, "html.parser")
tr = bs.find('tbody').find_all('tr')
for j in tr:
    td = j.find_all('td')
    number = td[0].get_text().strip()
    college = td[1].get_text().strip()
    province = td[2].get_text().strip()
    type = td[3].get_text().strip()
    score = td[4].get_text().strip()
    # The university cell also carries a relative link to its profile page
    a = j.find_all('a')
    wangzhi = a[0].get('href')
    link = 'http://www.shanghairanking.cn{0}'.format(wangzhi)
    print(number, college, province, type, score, link)

Result screenshot

Scraping Levitan landscape oil paintings

import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'https://www.sohu.com/a/286956359_301394'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# All of the painting images sit inside the <article class="article"> element
pictures = bs.find('article', {'class': "article"}).find_all('img')
i = 1
for picture in pictures:
    picture = picture.attrs['src']
    # Save each image as picture1.jpeg, picture2.jpeg, ... in a local folder
    dir = os.path.abspath('C:/Users/86186/picture')
    work_path = os.path.join(dir, 'picture{}.jpeg'.format(i))
    i = i + 1
    urlretrieve(picture, work_path)

Result screenshot

Scraping 51job (前程无忧) job listings

from urllib.request import Request, urlopen
import bs4
import json
import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('前程无忧招聘信息')
worksheet.write(0, 0, label='工作名称')
worksheet.write(0, 1, label='公司名称')
worksheet.write(0, 2, label='工作地区')
worksheet.write(0, 3, label='公司属性')
worksheet.write(0, 4, label='职位要求')
worksheet.write(0, 5, label='职责要求')

z = 1  # current row in the worksheet (row 0 is the header)
for x in range(1, 6):  # first five result pages of the "大数据" search
    url = ('https://search.51job.com/list/080200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,' + str(x)) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
    ret = Request(url=url, headers=header)
    html = urlopen(ret)
    bs = bs4.BeautifulSoup(html, 'html.parser')
    # The job list is embedded as JSON inside a <script> tag rather than in the HTML
    names = bs.find_all('script', type="text/javascript")
    for name in names:
        t = name.get_text()
        if len(t) > 0:
            t = t.replace('window.__SEARCH_RESULT__ = ', '')
            data = json.loads(t)
            y = data["engine_search_result"]
            for j in range(1, len(y)):
                hrefs = y[j]['job_href']
                worksheet.write(z, 0, y[j]['job_name'])
                worksheet.write(z, 1, y[j]['company_name'])
                worksheet.write(z, 2, y[j]['workarea_text'])
                worksheet.write(z, 3, y[j]['companytype_text'])
                worksheet.write(z, 4, y[j]['attribute_text'])
                # Follow each job's detail page to pull the full job description
                infos = Request(hrefs, headers=header)
                htmls = urlopen(infos)
                bs2 = bs4.BeautifulSoup(htmls, 'html.parser')
                try:
                    texts = bs2.find('div', {"class": 'bmsg job_msg inbox'}).get_text().split()
                    job_requests = "".join(texts)
                    worksheet.write(z, 5, label=job_requests)
                except:
                    worksheet.write(z, 5, label=' ')
                z += 1
workbook.save('前程无忧.xls')

Result screenshot

Scraping China Fortune news flashes

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

f = open('财富.txt', 'w', encoding='UTF-8')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'}
for i in range(2, 10):
    # Pages 2-9 of the news-flash list
    url = "http://www.cfbond.com/in/cfkxlb/index_{0}.shtml".format(i)
    ret = Request(url, headers=headers)
    html = urlopen(ret)
    bs = BeautifulSoup(html, "html.parser")
    # Follow every link on the list page and append the article body text to the file
    for link in bs.findAll('a'):
        link = link.get('href')
        website = Request(link, headers=headers)
        web = urlopen(website)
        bs2 = BeautifulSoup(web, "html.parser")
        texts = bs2.find_all('div', {'class': 's_xlLContCRC'})
        for text in texts:
            text = text.get_text().replace(' ', '').replace('\r', '')
            f.write(text)
f.close()

Result screenshot

Scraping papers from the Chinese Journal of Computers (计算机学报)

import os
from urllib.request import Request, urlopen, urlretrieve
import bs4

url = 'http://cjc.ict.ac.cn/qwjs/No2020-01.htm'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
names = []
# Paper titles are the spans styled with color:#006688
divs = bs.find_all('span', {'style': 'color:#006688'})
# The anchors on the page point at the papers' PDF files
links = bs.find_all('a')
i = 0
for div in divs:
    # The page is GBK-encoded but is decoded as ISO-8859-1, so re-encode and
    # decode to recover the Chinese titles
    div = div.get_text().encode('iso-8859-1').decode('gbk')
    names.append(div)
for link in links:
    pdf = link.get('href')
    dir = os.path.abspath('C:/Users/86186/essary')
    work_path = os.path.join(dir, '{}.pdf'.format(names[i]))
    urlretrieve(pdf, work_path)
    i = i + 1

Result screenshot
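One caveat with the loop above: bs.find_all('a') returns every anchor on the page, not only the PDF links, so the pairing with names[i] can drift. A small guard like the following is my own addition, not part of the original script; it assumes the PDF hrefs are absolute URLs and appear in the same order as the titles.

# Keep only anchors whose href ends in .pdf before downloading (illustrative fix).
pdf_links = [a.get('href') for a in bs.find_all('a')
             if a.get('href') and a.get('href').lower().endswith('.pdf')]
for name, pdf in zip(names, pdf_links):
    work_path = os.path.join(os.path.abspath('C:/Users/86186/essary'), '{}.pdf'.format(name))
    urlretrieve(pdf, work_path)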

Scraping the Douban Music Top 250

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlwt
# import time

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
worksheet.write(0, 0, "排名")
worksheet.write(0, 1, "歌名")
worksheet.write(0, 2, "歌手")
worksheet.write(0, 3, "发表时间")
worksheet.write(0, 4, "音乐类型")
worksheet.write(0, 5, "评分")
worksheet.write(0, 6, "详细链接")

j = 1  # current row in the worksheet (row 0 is the header)
for i in range(10):  # 10 pages x 25 albums = Top 250
    url = 'https://music.douban.com/top250?start={}'.format(i * 25)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    ret = Request(url, headers=headers)
    html = urlopen(ret).read()
    bs = BeautifulSoup(html, "html.parser")
    div = bs.find("div", {"class": "indent"})
    divs = div.find_all("div", {"class": "pl2"})
    for div_pl2 in divs:
        title = div_pl2.select('a')[0].text.replace(' ', '')
        title = title.replace('\n', ' ').replace('\r', '')
        # The <p class="pl"> line looks like "singer / release date / ... / genre"
        content = div_pl2.find("p", {"class": "pl"}).get_text().split('/')
        singer = content[0]
        music_time = content[1]
        music_type = content[-1]
        score = div_pl2.find("span", {"class": "rating_nums"}).get_text()
        link = div_pl2.find('a').get('href')
        worksheet.write(j, 0, j)
        worksheet.write(j, 1, title)
        worksheet.write(j, 2, singer)
        worksheet.write(j, 3, music_time)
        worksheet.write(j, 4, music_type)
        worksheet.write(j, 5, score)
        worksheet.write(j, 6, link)
        # time.sleep(1)
        j += 1
workbook.save('豆瓣音乐Top250.xls')

Result screenshot

Scraping novels

(incomplete version)

import os
from urllib.request import Request, urlopen
import bs4

url = 'https://www.xstt5.com/'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
ret = Request(url=url, headers=header)
html = urlopen(ret)
bs = bs4.BeautifulSoup(html, 'html.parser')
# Each <ul class="cl"> block on the home page lists links to novel pages
u = bs.find_all('ul', {'class': "cl"})
for i in u:
    links = i.find_all('a')
    for s in links:
        link = s.attrs['href']
        infos = Request(link, headers=header)
        htmls = urlopen(infos)
        bs1 = bs4.BeautifulSoup(htmls, 'html.parser')
        # <div class="ex"> holds the novel's description text
        div = bs1.find_all('div', {'class': "ex"})
        for j in div:
            print(j)
# TODO: save each novel's text to a local file (still unfinished)

Result screenshot
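To finish what the unfinished draft above was attempting, the scraped text could be written to local .txt files instead of being passed to urlretrieve (which only downloads a URL's raw bytes). The following is only a sketch: it reuses the div result from the loop above, and the output folder name is made up; real chapter pages would need their own selectors.

import os

# Hypothetical output folder -- adjust as needed.
out_dir = os.path.abspath('C:/Users/86186/novels')
os.makedirs(out_dir, exist_ok=True)

# Assuming `div` still holds the <div class="ex"> blocks from the loop above,
# write each block's text to its own numbered .txt file.
for k, d in enumerate(div, start=1):
    text = d.get_text().strip()
    work_path = os.path.join(out_dir, 'novel{}.txt'.format(k))
    with open(work_path, 'w', encoding='utf-8') as f:
        f.write(text)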

Still to learn:
Basic regular expressions
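Although re is imported in several of the scripts above, it is never actually used. A minimal example of what basic regular expressions can do (the pattern and sample string are my own, purely illustrative):

import re

html = '<td>1</td><td>Walmart</td><td>523,964</td>'   # illustrative sample row
cells = re.findall(r'<td>(.*?)</td>', html)            # non-greedy capture of each cell's text
print(cells)                                           # ['1', 'Walmart', '523,964']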
