网易云课堂 python网络爬虫实战

import requests
from bs4 import BeautifulSoup

# Download the Sina "China news" index page.
newsurl = 'http://news.sina.com.cn/china/'
res = requests.get(newsurl)
# Decode the body as UTF-8. The original notes record ISO-8859-1 as the
# encoding reported for this page, which rendered the Chinese text as garbage.
res.encoding = 'utf-8'
print(res)  # <Response [200]>
print(res.text)  # page HTML, decoded with the encoding set above
print(type(res))  # <class 'requests.models.Response'>
print(res.encoding)  # utf-8 now; the notes show ISO-8859-1 before the override
# A small in-memory HTML document used to try out BeautifulSoup selectors.
html_sample = '\
<html>\
<body>\
<h1 id="title">Hello World</h1>\
<a href="#" class="link">This is link1</a>\
<a href="#link2"class="link">This is link2</a>\
</body>\
</html>'
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html_sample, 'html.parser')
print(type(soup))  # <class 'bs4.BeautifulSoup'>
print(soup.text)  # Hello WorldThis is link1This is link2
# Selecting specific tags and nodes.
# select('h1') returns every <h1> element.
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')
print(header)  # [<h1 id="title">Hello World</h1>]
print(header[0])  # <h1 id="title">Hello World</h1>
print(header[0].text)  # Hello World
# Indexing with [0] yields the tag itself, printed as HTML without the
# surrounding list brackets.
# select('a') returns every <a> element.
alink = soup.select('a')
print(alink)
# [<a class="link" href="#">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
for link in alink:
    print(link)
#==============================================================================
# <a class="link" href="#">This is link1</a>
# <a class="link" href="#link2">This is link2</a>
#==============================================================================
for link in alink:
    print(link.text)
#==============================================================================
# This is link1
# This is link2
#==============================================================================
#==============================================================================
# Selecting elements by CSS attribute:
#   - prefix an id with "#" to match by id (e.g. '#title')
#   - prefix a class name with "." to match by class (e.g. '.link')
#==============================================================================
alink = soup.select('#title')
print(alink)  # [<h1 id="title">Hello World</h1>]
soup = BeautifulSoup(html_sample, 'html.parser')
for link in soup.select('.link'):
    print(link)
#==============================================================================
# <a class="link" href="#">This is link1</a>
# <a class="link" href="#link2">This is link2</a>
#==============================================================================
# Read the href of every <a> tag. A tag's attributes are exposed like a
# dictionary, so a single value is read with [] indexing.
alinks = soup.select('a')
for link in alinks:
    print(link['href'])
#==============================================================================
# #
# #link2
#==============================================================================
# Any attribute on a tag, not just href, can be read with [] indexing.
a = '<a href="#" qao=123 abc=456> i am a link</a>'
soup2 = BeautifulSoup(a, 'html.parser')
# Select once and reuse the tag instead of re-running the selector per print.
link_tag = soup2.select('a')[0]
# Original sample output: <a abc="456" href="#" qao="123"> i am a link</a>
# (the attribute order in the printed tag may differ between bs4 versions).
print(link_tag)
print(link_tag['abc'])  # 456
print(link_tag['qao'])  # 123
print(link_tag['href'])  # #
print(soup2.text)  #  i am a link
# Scrape news.sina.com.cn/china/ and pull content out by HTML tag / class.
from bs4 import BeautifulSoup
import requests
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers are installed.
soup = BeautifulSoup(res.text, 'html.parser')
# Print each news entry: every element carrying the "news-item" class.
for news in soup.select('.news-item'):
    print(news)
#==============================================================================
# <div class="news-item first-news-item ">
# <h2><a href="http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1800038.shtml" suda-uatrack="key=newschina_index_2014&value=news_link_1" target="_blank">上海一些订餐平台助推私宴牟利 相关部门介入</a></h2>
# <div class="info clearfix ">
# <div class="time">7月5日 09:20</div>
# <div class="action"><a data-id="gn:comos-fyhrxsk1800038:0" href="http://comment5.news.sina.com.cn/comment/skin/default.html?channel=gn&newsid=comos-fyhrxsk1800038&style=0" target="_blank">评论</a><span class="spliter">|</span><span class="bdshare_t bds_tools get-codes-bdshare" data="{text:'上海一些订餐平台助推私宴牟利 相关部门介入',url:'http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1800038.shtml',pic:''}" id="bdshare"><span class="bds_more">分享</span></span></div>
# </div>
# </div>
#==============================================================================
# Show the list of <h2> elements found inside each news item; the list is
# empty for items that contain no <h2>.
for news in soup.select('.news-item'):
    headings = news.select('h2')
    print(headings)
#==============================================================================
# [<h2><a href="http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1800038.shtml" suda-uatrack="key=newschina_index_2014&value=news_link_1" target="_blank">上海一些订餐平台助推私宴牟利 相关部门介入</a></h2>]
# [<h2><a href="http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhryex6189357.shtml" suda-uatrack="key=newschina_index_2014&value=news_link_2" target="_blank">四川暴雨来袭 成都打响211座城市桥梁保卫战</a></h2>]
# []
# []
# ...
# [<h2><a href="http://news.sina.com.cn/c/sd/2016-11-27/doc-ifxyawxa2866597.shtml" target="_blank">湖南从严推进县乡人大换届选举:铭记衡阳案教训</a></h2>]
# [<h2><a href="http://news.sina.com.cn/c/sd/2016-11-21/doc-ifxxwrwh4831425.shtml" target="_blank">北京国I国Ⅱ车辆明年2月15日起五环内限行</a></h2>]
#==============================================================================
# Print the headline text of every news item. Items without an <h2> give an
# empty selection and are skipped.
for news in soup.select('.news-item'):
    headings = news.select('h2')
    if headings:
        print(headings[0].text)
#==============================================================================
# 上海一些订餐平台助推私宴牟利 相关部门介入
# 四川暴雨来袭 成都打响211座城市桥梁保卫战
# 环保部:支持白洋淀治理规划 补齐生态环境短板
# “上海交警”APP短信挪车 暂时不能通知到外牌
# 西北政法大学校长贾宇去职 已在校生活38年
# ...
# 湖南从严推进县乡人大换届选举:铭记衡阳案教训
# 北京国I国Ⅱ车辆明年2月15日起五环内限行
#==============================================================================
# Print "<headline> <url>" for every news item that has an <h2>; items with
# an empty <h2> selection are skipped.
for news in soup.select('.news-item'):
    headings = news.select('h2')
    if headings:
        h2 = headings[0].text
        a = news.select('a')[0]['href']  # href of the item's first <a>
        print('%s %s' % (h2, a))
#==============================================================================
# 舰载机飞行员牺牲细节:4.4秒生死瞬间欲救战机 http://news.sina.com.cn/c/sd/2016-11-28/doc-ifxyawxa2907507.shtml
# 贵州童工多因贫困外出打工 有时连吃盐都成问题 http://news.sina.com.cn/o/2016-11-28/doc-ifxyasmv2025198.shtml
# 湖南从严推进县乡人大换届选举:铭记衡阳案教训 http://news.sina.com.cn/c/sd/2016-11-27/doc-ifxyawxa2866597.shtml
# 北京国I国Ⅱ车辆明年2月15日起五环内限行 http://news.sina.com.cn/c/sd/2016-11-21/doc-ifxxwrwh4831425.shtml
#==============================================================================
# Same listing with the publication time added: "<time> <headline> <url>".
# Items with an empty <h2> selection are skipped.
for news in soup.select('.news-item'):
    headings = news.select('h2')
    if headings:
        h2 = headings[0].text
        time = news.select('.time')[0].text  # text of the element with class "time"
        a = news.select('a')[0]['href']  # href of the item's first <a>
        print('%s %s %s' % (time, h2, a))
#==============================================================================
# 11月28日 07:50 舰载机飞行员牺牲细节:4.4秒生死瞬间欲救战机 http://news.sina.com.cn/c/sd/2016-11-28/doc-ifxyawxa2907507.shtml
# 11月28日 05:41 贵州童工多因贫困外出打工 有时连吃盐都成问题 http://news.sina.com.cn/o/2016-11-28/doc-ifxyasmv2025198.shtml
# 11月27日 14:01 湖南从严推进县乡人大换届选举:铭记衡阳案教训 http://news.sina.com.cn/c/sd/2016-11-27/doc-ifxyawxa2866597.shtml
# 11月21日 15:00 北京国I国Ⅱ车辆明年2月15日起五环内限行 http://news.sina.com.cn/c/sd/2016-11-21/doc-ifxxwrwh4831425.shtml
#==============================================================================
# Fetch a single article page.
import requests
from bs4 import BeautifulSoup
res = requests.get('http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1791835.shtml')
res.encoding = 'utf-8'
print(res.text)  # original note: the fetch succeeded
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers are installed.
soup = BeautifulSoup(res.text, 'html.parser')
# The article title is the text of the element with id "artibodyTitle"
# (an <h1> on this page, per the original note).
alink = soup.select('#artibodyTitle')[0].text
print(alink)  # the article headline
# Publication time and source, shown on the right of the page per the
# original note, are both inside the element with class "time-source".
time = soup.select('.time-source')[0]
print(time)
#==============================================================================
# <span class="time-source" id="navtimeSource">2017年07月05日07:57           <span>
# <span data-sudaclick="media_name"><a href="http://www.thepaper.cn/newsDetail_forward_1724885" rel="nofollow" target="_blank">新浪综合</a></span></span>
# </span>
#==============================================================================
# Split the publication time from the source name.
# Sample values from the original run: "2017年07月05日07:57" and "新浪综合".
# The first child node of the .time-source element is the timestamp text.
timesource = soup.select('.time-source')[0].contents[0].strip()
print(timesource)  # 2017年07月05日07:57
# Original note: the value is unicode rather than a byte string. This bare
# expression only displays something in an interactive session.
type(timesource)
import json
# ensure_ascii=False keeps non-ASCII characters readable instead of emitting
# \uXXXX escapes. The Python 2-only encoding= keyword is omitted: its default
# is already UTF-8 there, and Python 3 rejects it.
print(json.dumps(timesource, ensure_ascii=False))
#==============================================================================
# Time-string conversion (left commented out, as in the original).
# Original note: Python 3.x defaults to UTF-8, so a conversion is needed.
# String -> datetime with strptime:
# from datetime import datetime
# dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
# dt
# datetime -> string with strftime:
# dt.strftime('%Y-%m-%d')
#==============================================================================
# The source name is the text of the <a> nested in a <span> inside .time-source.
medianame = soup.select('.time-source span a')[0].text
print(medianame)  # 新浪综合
print(json.dumps(medianame, ensure_ascii=False))
# Get the article body: select the element whose id is "artibody".
# NOTE(review): this is a bare expression, so its result is discarded when the
# file runs as a script; it is only displayed in an interactive session.
soup.select('#artibody')
#==============================================================================
# [<div class="article article_16" id="artibody">
#  <p>　　来源：澎湃新闻</p>
#  <p>　　原标题：起底章莹颖案嫌犯所上网站：仍有大量绑架内容，卷入多起刑案</p>
#  <p>　　克里斯滕森的车因与最后载走章莹颖的车很相似而被调查。FBI在检查其手机时发现，他曾在4月访问过一家成人社交网站中的“新手绑架课程”论坛，浏览了“完美绑架幻想”和“计划一场绑架”等帖子。FBI在刑事起诉书中称，这对实施绑架起到了作用。 </p>
#  <p>　　澎湃新闻（www.thepaper.cn）调查发现，涉事网站此前就多次卷入刑事案件，引起了执法部门的注意，但网站上至今仍存在大量关于绑架的内容，并未受到明显影响。网站用户的国际性以及内容的“灰色地带”都给监管带来了难度。 </p>
#
#==============================================================================
# NOTE(review): both lines are bare expressions; their results are only
# displayed in an interactive session.
soup.select('#artibody p')# only the <p> elements inside #artibody
soup.select('#artibody p')[:-1]# the same selection with the last <p> dropped
#==============================================================================
#  <p>　　据报道，法院将于本周三下午再次举行聆讯，不过克里斯滕森的律师布鲁诺表示，“面对这样的控告，总会是一场艰难的战斗。”布罗诺说，在联邦系统中，如果控告涉及暴力犯罪或者武器，被告很少会被取保候审。</p>,
#  <p>　　布鲁诺还呼吁公众以开放心态看待本案，他指出公众目前了解的信息“并不是故事的全部”，还有许多公众没有意识到的信息。</p>,
#  <p>　　来源：澎湃新闻</p>]
#==============================================================================
# Collect the text of every body paragraph except the last one.
article = []
for p in soup.select('#artibody p')[:-1]:
    article.append(p.text)
type(article)  # a list; only displayed in an interactive session
# Printing a list shows the repr() of each element. Under Python 2 that means
# u'\uXXXX' escapes instead of readable characters, which is what the sample
# output that follows shows. A list has no .encode() method, so encoding has
# to be applied per string or via json.dumps(..., ensure_ascii=False).
print(article)
#==============================================================================
# \u4ed6\u6307\u51fa\u516c\u4f17\u76ee\u524d\u4e86\u89e3\u7684\u4fe1\u606f\u201c
# \u5e76\u4e0d\u662f\u6545\u4e8b\u7684\u5168\u90e8\u201d\uff0c\u8fd8\u6709\u8bb8
# \u591a\u516c\u4f17\u6ca1\u6709\u610f\u8bc6\u5230\u7684\u4fe1\u606f\u3002',
#  u'\u3000\u3000\u6765\u6e90\uff1a\u6f8e\u6e43\u65b0\u95fb']
#==============================================================================
# Render the list with readable (non-escaped) characters. The Python 2-only
# encoding= keyword is omitted: its default is already UTF-8 there, and
# Python 3 rejects it.
print(json.dumps(article, ensure_ascii=False))
#==============================================================================
# ["　　来源：澎湃新闻", "　　原标题：起底章莹颖案嫌犯所上网站：仍有大量绑架内容，卷入多起刑案",
# "　　克里斯滕森的车因与最后载走章莹颖的车很相似而被调查。FBI在检查其手机时发现，他曾在4月访问过一家成人社交网站中的“新手绑架课程”论坛，浏览了“完美绑架幻想”和“计划一场绑架”等帖子。FBI在刑事起诉书中称，这对实施绑架起到了作用。 ",
#  "　...]
#==============================================================================
# NOTE(review): the three lines below are bare expressions; their results are
# only displayed in an interactive session.
# Join the collected paragraphs into one string separated by single spaces.
' '.join(article)
# The same paragraphs (last <p> dropped) with surrounding whitespace stripped,
# built with a list comprehension.
[p.text.strip() for p in soup.select('#artibody p')[:-1]]
# The stripped paragraphs joined into a single space-separated string.
' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])

网易云课堂 python网络爬虫实战相关推荐

Python 网络爬虫实战：去哪儿网旅游攻略图文爬取保存为 Markdown电子书
接上回,<Python 网络爬虫实战:爬取<去哪儿>网数千篇旅游攻略数据>. 我们爬取到了数千篇的旅游攻略文章的数据. 但是事情还没有结束,对于大部分的人来讲,最希望得到的东西 ...
Python网络爬虫实战案例之：7000本电子书下载（2）
一.前言本文是<Python开发实战案例之网络爬虫>的第二部分:7000本电子书下载网络爬虫开发环境安装部署.配套视频课程详见网易云课堂二.章节目录 (1)Python开发环境依赖 ( ...
python爬虫文件代码大全-Python网络爬虫实战项目代码大全（长期更新，欢迎补充）...
WechatSogou[1]- 微信公众号爬虫.基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫,返回结果是列表,每一项均是公众号具体信息字典.[1]: https://github ...
python常用代码大全-Python 网络爬虫实战项目代码大全
原标题:Python 网络爬虫实战项目代码大全 DouBanSpider 豆瓣读书的爬虫.你可以爬豆瓣读书下面标签下的所有图书,按评分排名依次存储,存储到Excel中,可方便大家筛选搜罗,比如筛选评价 ...
python基础代码大全-Python网络爬虫实战项目代码大全（长期更新，欢迎补充）
WechatSogou[1]- 微信公众号爬虫.基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫,返回结果是列表,每一项均是公众号具体信息字典.[1]: https://github ...
Python网络爬虫实战项目代码大全（长期更新，欢迎补充）
Python网络爬虫实战项目代码大全(长期更新,欢迎补充) 阿橙 · 1 个月内 WechatSogou [1]- 微信公众号爬虫.基于搜狗微信搜索的微信公众号爬虫接口,可以扩展成基于搜狗搜索的爬虫, ...
python基础实例韦玮 pdf_韦玮：Python网络爬虫实战解析
2016年12月27日晚8点半,CSDN特邀IT专家.<Python系列实战教程>系列图书作者韦玮带来了主题为"Python网络爬虫反爬破解策略实战"的Chat交流.以 ...
Python网络爬虫实战：根据天猫胸罩销售数据分析中国女性胸部大小分布
Python网络爬虫实战:根据天猫胸罩销售数据分析中国女性胸部大小分布本文实现一个非常有趣的项目,这个项目是关于胸罩销售数据分析的.是网络爬虫和数据分析的综合应用项目.本项目会从天猫抓取胸罩销售数据 ...
Python网络爬虫实战：近千条中秋节祝福语文案让你成为亲朋好友里最靓的仔
中秋节马上到了,不知道大家有没有像我这样的烦恼,每次过节,都要绞尽脑汁想好久,发什么样的祝福语才显得有诚意又有创意,什么样的朋友圈文案会有文化又有逼格. 去网上搜吧,搜出来的祝福语,画风大多是像这样的 ...
python商业爬虫教程_廖雪峰老师的Python商业爬虫课程 Python网络爬虫实战教程体会不一样的Python爬虫课程...
廖雪峰老师的Python商业爬虫课程 Python网络爬虫实战教程体会不一样的Python爬虫课程 1.JPG (53.51 KB, 下载次数: 1) 2019-8-9 08:15 上传 2.JPG ...

网易云课堂 python网络爬虫实战

网易云课堂 python网络爬虫实战相关推荐

最新文章

热门文章