一、response响应参数

1、response六个参数

response.read().decode().encode() # decode() turns bytes into str; encode() turns str back into bytes
response.readline()# read a single line
response.readlines()# read all lines; the data is binary (bytes)
response.geturl()# URL that was requested
response.getheaders()# get the response headers
response.getcode()# response status code, e.g. 200

2、代码实现

import urllib.request

url = 'http://www.baidu.com/'

# Open the URL as a context manager so the connection is always released.
with urllib.request.urlopen(url=url) as response:
    # read() returns the whole body as bytes and moves the stream cursor to
    # the end of the page; decode() turns the bytes into str and encode()
    # turns a str back into bytes.
    # print(response.read().decode().encode())
    print(response.read().decode())
    # readline() reads one line (bytes).  The body was already consumed by
    # read() above, so there is nothing left to return here.
    print(response.readline())
    # readlines() reads all remaining lines as a list of bytes objects; it is
    # empty here for the same reason.
    print(response.readlines())
    # URL that was requested, e.g. http://www.baidu.com/
    print(response.geturl())
    # Response headers as a list of (name, value) tuples.  Sample output:
    # [('Bdpagetype', '1'), ('Bdqid', '0x820558ba00024ee3'), ('Cache-Control', 'private'), ('Content-Type', 'text/html'), ('Cxy_all', 'baidu+0c2a38edaddb678d40838303d1f94212'), ('Date', 'Mon, 22 Oct 2018 06:33:29 GMT'), ('Expires', 'Mon, 22 Oct 2018 06:33:09 GMT'), ('P3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('Server', 'BWS/1.1'), ('Set-Cookie', 'BAIDUID=2EF25B13DB45A9EEF4DD58F5E957A73A:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BIDUPSID=2EF25B13DB45A9EEF4DD58F5E957A73A; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'PSTM=1540190009; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'delPer=0; path=/; domain=.baidu.com'), ('Set-Cookie', 'BDSVRTM=0; path=/'), ('Set-Cookie', 'BD_HOME=0; path=/'), ('Set-Cookie', 'H_PS_PSSID=1458_21094_26350_22075; path=/; domain=.baidu.com'), ('Vary', 'Accept-Encoding'), ('X-Ua-Compatible', 'IE=Edge,chrome=1'), ('Connection', 'close'), ('Transfer-Encoding', 'chunked')]
    print(response.getheaders())
    # HTTP status code, e.g. 200
    print(response.getcode())

二、urlretrieve下载文件

1、urlretrieve参数

url = html_url = 'https://www.baidu.com/'  # URL of the file to download
filename = filenames = r"E:\Knowledge\爬虫\files\baidu1.html" # local path (including the file name) where the download is stored

2、代码实现

#本地存储  网页下载  视频下载  图片下载
import urllib.request
def html_download():
    """Download the Baidu home page and store it as a local HTML file."""
    html_url = 'https://www.baidu.com/'
    filenames = r"E:\Project_python\python_directory\Knowledge\爬虫\files\baidu1.html"
    urllib.request.urlretrieve(url=html_url, filename=filenames)


def picture_download():
    """Download one picture and store it on the local disk."""
    picture_url = 'http://img2.ali213.net/picfile/News/2018/10/22/2018102271311324.jpg'
    # NOTE(review): the source URL ends in .jpg but the target name ends in
    # .png; urlretrieve only copies bytes -- confirm the intended extension.
    filenames = r"E:\Project_python\python_directory\Knowledge\爬虫\files\RNG.png"
    urllib.request.urlretrieve(url=picture_url, filename=filenames)


def movie_download():
    """Download one video and store it on the local disk."""
    movie_url = 'http://v11-tt.ixigua.com/9afa9be76829c9c86efc5ed457b5e40b/5bcdd016/video/m/2206a2ae6c28c2044c29a8a93a76e7ad720115bc34400002caef5bf2788/'
    filenames = r"E:\Project_python\python_directory\Knowledge\爬虫\files\回家.MP4"
    urllib.request.urlretrieve(url=movie_url, filename=filenames)


if __name__ == "__main__":
    # Uncomment the download you want to run.
    # html_download()
    # picture_download()
    # movie_download()
    pass  # keeps the guard syntactically valid while every call is disabled

三、Get请求路由参数

1、单个参数

1、方法

urllib.request.quote(name) # name is a str; quote percent-encodes it for use in a URL (the function is documented as urllib.parse.quote)

2、代码实现

def http_get_quote():
    """Search Baidu for one keyword typed by the user and print the page.

    The keyword is read from standard input, percent-encoded, appended to the
    ``wd=`` query parameter and fetched with a plain GET request.
    """
    # Local imports keep this snippet runnable on its own.
    import urllib.parse
    import urllib.request

    get_url = 'http://www.baidu.com/s?wd='
    name = input('输入关键字：')
    # Percent-encode the single keyword so that it is legal inside a URL.
    # urllib.parse.quote is the documented home of the function that
    # urllib.request.quote merely re-exports.
    gjz = urllib.parse.quote(name)
    urls = get_url + gjz
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(url=urls) as response:
        print(response.read().decode())

2、多个参数

1、方法

urllib.parse.urlencode(data) # several keywords: collect them in a dict (data) and let urlencode build the encoded query string

2、代码实现

def http_get_urlencode():
    """Search Baidu with a urlencode-built query string and print the page.

    The keyword is read from standard input and placed in a dict so that
    ``urllib.parse.urlencode`` can build (and percent-encode) the query part;
    more parameters can be added to the dict in the same way.
    """
    # Local imports keep this snippet runnable on its own.
    import urllib.parse
    import urllib.request

    get_url = 'http://www.baidu.com/s?'
    name = input('输入关键字：')
    data = {'wd': name}
    # Turn the dict into "key=value&key=value" with every value encoded.
    urldata = urllib.parse.urlencode(data)
    urls = get_url + urldata
    print(urls)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(url=urls) as response:
        print(response.read().decode())

四、Get AND Post

1、Get请求

def http_get():
    """Send a GET search request to Baidu with a custom User-Agent.

    The keyword is read from standard input, encoded into the query string
    and sent through a ``urllib.request.Request`` object so that request
    headers can be attached.  The decoded page is printed.
    """
    # Local imports keep this snippet runnable on its own.
    import urllib.parse
    import urllib.request

    get_url = 'http://www.baidu.com/s?'
    name = input('输入关键字：')
    data = {'wd': name}
    data = urllib.parse.urlencode(data)
    url = get_url + data
    # The header name must be spelt "User-Agent"; the previous "User-Agnet"
    # key was sent as an unrelated header, so the custom agent never applied.
    headers = {"User-Agent": "Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"}
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        print(response.read().decode('utf-8'))

2、Post请求

def http_post():
    """Send a POST request to the Baidu translate suggestion API.

    The word to look up is read from standard input.  Unlike GET, where the
    parameters travel in the URL, POST sends them as a form body, so the
    url-encoded string must additionally be encoded to bytes ('utf-8').
    The decoded response is printed; if the request times out a message is
    printed instead and the function returns.
    """
    # Local imports keep this snippet runnable on its own.
    import socket
    import urllib.error
    import urllib.parse
    import urllib.request

    post_url = "https://fanyi.baidu.com/sug"
    keyword = input("输入搜索的单词:")
    data = {'kw': keyword}
    # POST bodies must be bytes: url-encode first, then encode as utf-8.
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    request = urllib.request.Request(url=post_url, headers=headers, data=data)
    try:
        with urllib.request.urlopen(request, timeout=0.5) as response:
            body = response.read().decode("utf-8")
    except socket.timeout:
        # Timeout while reading the response.
        print("请求超时")
        return
    except urllib.error.URLError as err:
        # A timeout while connecting is wrapped in URLError; anything else
        # (DNS failure, HTTP error status, ...) is not a timeout and must
        # not be reported as one.
        if not isinstance(err.reason, socket.timeout):
            raise
        print("请求超时")
        return
    # Only reached when the request succeeded, so `body` is always bound.
    print(body)

五、请求方式的定制

1、代码实现

import urllib.parse
import urllib.request


def headers_add():
    """Fetch a Baidu search page through a customised Request object.

    Customising the request means building a dedicated
    ``urllib.request.Request``: it carries its own headers and, depending on
    whether a body is attached, is sent as GET or POST.  Here no body is
    attached, so the request is a GET.  The decoded page is printed.
    """
    url = 'http://www.baidu.com/s?'
    data = {
        'wd': '韩红',
        'age': 25,
    }
    urls = url + urllib.parse.urlencode(data)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    }
    request = urllib.request.Request(url=urls, headers=headers)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        print(response.read().decode())

六、Get方式获取Ajax

1、代码实现

获取豆瓣电影的数据，通过定制的get请求，模拟浏览器通过ajax向服务器发送请求，获取json数据

import urllib.request
import urllib.parse


def Ajax_request():
    """Fetch one page of the Douban movie chart as JSON and save it to disk.

    The page number is read from standard input.  The request imitates the
    Ajax call a browser sends to the server: a GET request with a browser
    User-Agent and the query parameters observed in the browser's network
    panel.  The JSON text of the response is written to a local file.
    """
    page = int(input("请输入页码："))
    url_ajax = 'https://movie.douban.com/j/chart/top_list?'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36',
    }
    # Each page holds 20 entries, so page N starts at offset (N - 1) * 20.
    pagenumber = (page - 1) * 20
    # Parameters copied from the request the page itself sends.
    data = {
        'type': '24',
        'interval_id': '100:90',
        'action': '爱情',
        'start': pagenumber,
        'limit': '20',
    }
    data = urllib.parse.urlencode(data)
    urls = url_ajax + data
    request = urllib.request.Request(url=urls, headers=headers)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        context = response.read().decode('utf-8')
    with open(r'E:\Project_python\……\douban.json', 'w', encoding='utf-8') as fp:
        fp.write(context)

七、简单封装

1、豆瓣、页面、封装

import urllib.request
import urllib.parse


def create_request(type, page):
    """Build the customised GET request for one Douban tag page.

    The resulting URL has the form
    https://movie.douban.com/tag/动作?start=60&type=T (tag percent-encoded).

    Args:
        type: Tag (genre) name as a str; it is percent-encoded into the path.
            The parameter name shadows the builtin ``type`` and is kept only
            for backward compatibility with existing callers.
        page: 1-based page number; each page holds 20 entries.

    Returns:
        A ``urllib.request.Request`` carrying the URL and a browser
        User-Agent header.
    """
    url_post = 'https://movie.douban.com/tag/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36',
    }
    # Page N starts at offset (N - 1) * 20.
    start = (page - 1) * 20
    query = urllib.parse.urlencode({"start": start, 'type': 'T'})
    # urllib.parse.quote is the documented home of the function that
    # urllib.request.quote merely re-exports.
    tag = urllib.parse.quote(type)
    url = url_post + tag + '?' + query
    print(url)
    return urllib.request.Request(url=url, headers=headers)


def save_context(request):
    """Send the request and return the page source as a str (utf-8)."""
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def down_load(page, context):
    """Write the page source to ``douban_<page>.html`` in the target folder.

    Args:
        page: Page number used in the file name.
        context: Page source (str) to store.
    """
    url = r'E:\Project_python\python_directory\Knowledge\爬虫\files\douban_fengzhuang'
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    filename = url + r'\douban_' + str(page) + '.html'
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(context)


if __name__ == "__main__":
    genre = input("类型：")
    startpage = int(input("开始页："))
    endpage = int(input("结束页:"))
    for page in range(startpage, endpage + 1):
        request = create_request(genre, page)
        context = save_context(request)
        down_load(page, context)

2、KFC、Json、封装

import urllib.request
import urllib.parse


def create_request(cname, page):
    """Build the POST request for one page of the KFC store-list API.

    Args:
        cname: City name to query.
        page: Page index to request (10 stores per page).

    Returns:
        A ``urllib.request.Request`` whose body is the url-encoded form data
        as utf-8 bytes; attaching a body makes it a POST request.
    """
    post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    form = {
        'cname': cname,
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    }
    body = urllib.parse.urlencode(form).encode('utf-8')
    return urllib.request.Request(url=post_url, headers=headers, data=body)


def save_content(request):
    """Send the request and return the raw response body as bytes.

    The body is deliberately not decoded, so the caller has to write it in
    binary ('wb') mode.
    """
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read()


def down_load(content, page):
    """Write the raw response bytes to ``KFC_<page>.html``.

    Args:
        content: Response body as bytes.
        page: Page number used in the file name.
    """
    url = r'E:\Project_python\python_directory\Knowledge\爬虫\files\douban_fengzhuang'
    # Raw string: '\K' in a normal literal is an invalid escape sequence.
    filename = url + r'\KFC_' + str(page) + '.html'
    # Binary data needs 'wb' mode, and no encoding may be given in that mode.
    with open(filename, 'wb') as fp:
        fp.write(content)


if __name__ == '__main__':
    cname = input('请输入你要查询的地点')
    start_page = int(input('请输入起始页码'))
    end_page = int(input('请输入结束页码'))
    for page in range(start_page, end_page + 1):
        request = create_request(cname, page)
        content = save_content(request)
        down_load(content, page)

八、代理池和快代理

1、代理池

import urllib.request
import random


def agent():
    """Fetch a Baidu "ip" search page through a randomly chosen proxy.

    A proxy pool is simply a list of usable proxy addresses; one entry is
    picked at random for the request.  The decoded page is written to a
    local HTML file.
    """
    url = 'https://www.baidu.com/s?wd=ip'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The proxy pool: each entry maps a URL scheme to "host:port".
    proxylist = [
        {'https': '114.116.10.21:3128'},
        {'https': '120.92.74.189:3128'},
        {'https': '113.200.56.13:8010'},
    ]
    proxies = random.choice(proxylist)
    # ProxyHandler takes the mapping through its `proxies` parameter.
    handler = urllib.request.ProxyHandler(proxies=proxies)
    opener = urllib.request.build_opener(handler)
    # The context manager closes the connection once the body has been read.
    with opener.open(request) as response:
        content = response.read().decode('utf-8')
    with open(r'E:\Project_python\python_directory\Knowledge\爬虫\files\zaza\ip1.html', 'w', encoding='utf-8') as fp:
        fp.write(content)

2、快代理

import urllib.request
#也就网上购买代理ip，从而隐藏自身ip信息。
def Fast_agent():
    """Fetch a Baidu "ip" search page through a proxy bought from Kuaidaili.

    The provider's API returns one "host:port" address; that address is used
    as the https proxy for the real request, and the decoded page is written
    to ``ip2.html``.  (The original author marked the order id in the API
    URL as no longer valid.)
    """
    # API of the proxy provider: returns the purchased high-anonymity proxy.
    url = 'http://kps.kdlapi.com/api/getkps/?orderid=914028249605725&num=1&pt=1&sep=1'
    with urllib.request.urlopen(url=url) as response:
        # Strip surrounding whitespace/newlines so the text is a clean
        # "host:port" value before it is handed to ProxyHandler.
        ip1 = response.read().decode('utf-8').strip()
    print(ip1)
    # Example of the expected format: ip1 = '114.116.10.21:3128'
    handler = urllib.request.ProxyHandler(proxies={'https': ip1})
    opener = urllib.request.build_opener(handler)
    url1 = 'https://www.baidu.com/s?wd=ip'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    request = urllib.request.Request(url=url1, headers=headers)
    # The context manager closes the connection once the body has been read.
    with opener.open(request) as response1:
        content1 = response1.read().decode('utf-8')
    with open('ip2.html', 'w', encoding='utf-8') as fp:
        fp.write(content1)

九、cookiejar—全书网

1、代码实现

#在未登录的情况下，进入全书网的个人藏书架界面，思路：先模仿登录请求，然后携带cookie访问藏书架
#如何抓取登陆接口,
import urllib.request
import urllib.parse
import http.cookiejar

post_url = 'http://www.quanshuwang.com/login.php?do=submit'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
data = {
    'username': 'action',
    'password': 'action',
    'action': 'login',
}
# The form body is sent gbk-encoded, matching the gbk decoding used below.
data = urllib.parse.urlencode(data).encode('gbk')
request = urllib.request.Request(url=post_url, headers=headers, data=data)
# Create the CookieJar object that will hold the cookies.
ck = http.cookiejar.CookieJar()
# Pass the CookieJar to HTTPCookieProcessor.
hander = urllib.request.HTTPCookieProcessor(ck)
# Build the opener from that handler.
opener = urllib.request.build_opener(hander)
# Send the login request through the opener: the cookies set by the server
# are stored in the CookieJar, and every later request made through the same
# opener carries them.
response = opener.open(request)
# print(response.read().decode('gbk'))
# The login body is not needed; release the connection.
response.close()
get_url = 'http://www.quanshuwang.com/modules/article/bookcase.php'
request1 = urllib.request.Request(url=get_url, headers=headers)
# A plain urlopen would not send the login cookies:
# response1 = urllib.request.urlopen(request1)
with opener.open(request1) as response1:
    print(response1.read().decode('gbk'))

壹、爬虫进阶（聚焦爬虫）

十、XPath：XML路径语言

1、简介

XPath即为XML路径语言（XML Path Language），它是一种用来确定XML文档中某部分位置的语言。

XPath基于XML的树状结构，提供在数据结构树中找寻节点的能力。XPath提出的初衷是将其作为一个通用的、介于XPointer与XSL间的语法模型，但是它很快就被开发者当作小型查询语言来使用。

2、xpath语法

表达式      /       描述
nodename        选取此节点的所有子节点。
/               从根节点选取。
//              从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置。
.               选取当前节点。
..              选取当前节点的父节点。
div[@id="mat"]  #谓词（属性）查询(id唯一，class不唯一)
div[contains(@id, "he")] #模糊查询id属性包含he
div[starts-with(@id, "he")] #模糊查询id属性以he开头
//div[ends-with(@id, "he")] #模糊查询id属性以he结尾（注意：ends-with 属于 XPath 2.0，lxml 只支持 XPath 1.0，在 lxml 中不可用）
//div/h1/text()  #/text() 取标签中的文本内容
//div[@id="head" and @class="s_down"] #逻辑运算与
//title | //price  #逻辑运算或

3、安装步骤

1、安装lxml库   pip install lxml
2、安装xpath插件：将xpath.crx拖到浏览器的扩展程序页面中；打开xpath调试框的快捷键是 ctrl + shift + x
3、在python文件中  from lxml import etree
4、创建对象
（1）html_tree = etree.parse('XX.html')  #解析本地html文件
（2）html_tree = etree.HTML(response.read().decode('utf-8'))  #解析响应文件
5、解析并返回结果
result_list = html_tree.xpath("xpath路径")  #返回为一个列表

4、代码实现

import urllib.request
from lxml import etree  # key import: provides the XPath parser


def create_request(page):
    """Build the GET request for one listing page of qiushibaike.

    Args:
        page: Page number appended to the listing URL.

    Returns:
        A ``urllib.request.Request`` with a browser User-Agent header.
    """
    url = 'https://www.qiushibaike.com/8hr/page/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    url = url + str(page) + '/'
    return urllib.request.Request(url=url, headers=headers)


def save_content(request):
    """Send the request and return the page source as a str (utf-8)."""
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def parse_content(content):
    """Extract the image addresses from the page source.

    This is the key step; the other functions follow the usual pattern.

    Args:
        content: Page source as a str.

    Returns:
        The list of ``src`` attribute values of every ``img`` found below a
        ``div`` whose class is "thumb".
    """
    tree = etree.HTML(content)
    # The XPath ends in /@src, so it yields attribute values, not elements.
    return tree.xpath('//div[@class="thumb"]//img/@src')


def down_load(src_list):
    """Download every image in ``src_list`` into the qiushibaike/ folder.

    Args:
        src_list: Image addresses without a scheme; 'https:' is prepended to
            each one before downloading.
    """
    for src in src_list:
        src = 'https:' + src
        # The last path segment of the address is used as the file name.
        name = src.split('/')[-1]
        file_name = 'qiushibaike/' + name
        urllib.request.urlretrieve(url=src, filename=file_name)


if __name__ == '__main__':
    start_page = int(input('请输入起始页码'))
    end_page = int(input('请输入结束页码'))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = save_content(request)
        src_list = parse_content(content)
        down_load(src_list)

十一、JsonPath

1. 介绍

类似于XPath在xml文档中的定位，JsonPath表达式通常是用来路径检索或设置Json的。其表达式可以接受“dot–notation”和“bracket–notation”格式，

“dot–notation”：$.store.book[0].title
“bracket–notation”：$['store']['book'][0]['title']

2、操作符

符号	描述
$	查询的根节点对象，用于表示一个json数据，可以是数组或对象
@	过滤器断言（filter predicate）处理的当前节点对象，类似于java中的this字段
*	通配符，可以表示一个名字或数字
..	可以理解为递归搜索，Deep scan. Available anywhere a name is required.
.<name>	表示一个子节点
['<name>' (, '<name>')]	表示一个或多个子节点
[<number> (, <number>)]	表示一个或多个数组下标
[start:end]	数组片段，区间为[start,end),不包含end
[?(<expression>)]	过滤器表达式，表达式结果必须是boolean

3、资料

JSONPath 表达式

JSONPath 参照了用 XPath 表达式解析 XML 文档的方式。JSON 数据结构通常是匿名的，并且不一定需要有根元素，因此 JSONPath 用一个抽象的名字 $ 来表示最外层对象。

JSONPath 表达式可以使用 . 符号，如下：

$.store.book[0].title

或者使用[] 符号

$['store']['book'][0]['title']

以上两种写法用于输入路径；内部或者输出的路径都会被转换成更通用的 [] 符号（bracket-notation）。

JSONPath 允许使用通配符 * 表示所有的子元素名和数组索引。它还借鉴了 E4X 的递归下降操作符 '..'，以及 ECMAScript 4 的数组切分语法 [start:end:step]。

可以使用底层脚本语言的表达式 (<expr>) 来代替显式的名称或者索引：

$.store.book[(@.length-1)].title

使用 '@' 符号表示当前的对象，?(<判断表达式>) 使用逻辑表达式来过滤。

$.store.book[?(@.price < 10)].title

这里有个表格，说明JSONPath语法元素和对应XPath元素的对比。

XPath	JSONPath	Description
/	$	表示根元素
.	@	当前元素
/	. or []	子元素
..	n/a	父元素
//	..	递归下降，JSONPath是从E4X借鉴的。
*	*	通配符，表示所有的元素
@	n/a	属性访问字符
[]	[]	子元素操作符
|	[,]	连接操作符：在XPath中用于合并多个结点集合；JSONPath允许用它指定多个name或者数组索引。
n/a	[start 零、爬虫基础（通用爬虫）相关推荐什么是网络爬虫以及通用爬虫和聚焦爬虫的介绍什么是网络爬虫 1.为什么要学习爬虫技术? 2.爬虫的定义 3.爬虫的更多用途 4.爬虫怎么抓取网页上的数据? 1.网页三大特征: 2.爬虫的设计思路: 5.为什么要选择Python做爬虫? 6.通用 ... python 下载文件限速-Python网络爬虫---scrapy通用爬虫及反爬技巧一.通用爬虫通用爬虫一般有以下通用特性:爬取大量(一般来说是无限)的网站而不是特定的一些网站. 不会将整个网站都爬取完毕,因为这十分不实际(或者说是不可能)完成的.相反,其会限制爬取的时间及数量. ... 网络爬虫--1.通用爬虫和聚焦爬虫文章目录一.前言二.通用爬虫 1.工作原理 2.通用爬虫的局限性三.聚焦爬虫一.前言根据使用场景,网络爬虫可分为通用爬虫和聚焦爬虫两种. 其中通用网络爬虫是捜索引擎抓取系统(Baid ... 聚焦爬虫与通用爬虫详解根据使用场景,网络爬虫可分为通用爬虫和聚焦爬虫两种. 通用爬虫通用爬虫:搜索引擎用的爬虫系统.搜索引擎和web服务商提供的爬虫. 目标: 就是尽可能的:把互联网上的所有的网页下载下来,放到本 ... Python爬虫——Scrapy通用爬虫大家好,我是霖hero 除了钱,大家还比较喜欢什么?当然是全能.万能和通用的人或事物啦,例如:全能.什么都会的员工.万能钥匙.通用爬虫等等.今天我们学习Scrapy通用爬虫,利用Scrapy通用爬虫来 ... 【0基础学爬虫】爬虫基础之爬虫的基本介绍大数据时代,各行各业对数据采集的需求日益增多,网络爬虫的运用也更为广泛,越来越多的人开始学习网络爬虫这项技术,本期为爬虫的基本介绍. 分享一些自己的爬虫项目,学习爬虫一些经验很不错基于python实 ... Python网络爬虫基础目录目录一.HTML基础 1.HTML相关概念&发展历史 1.1HTML及相关概念的介绍 1.2HTML发展历史编辑2.WEB标准.W3C/ECMA相关概念 2.1WEB标准的概念及组 ... 小白必学的爬虫基础（二）爬虫基础知识网络爬虫爬虫可以解决的问题爬虫工程师的进阶之路搜索引擎搜索引擎的主要组成搜索引擎的工作流程搜索引擎的局限性聚焦爬虫爬虫准备工作 robots协议 sitemap–网站地图 ... Spider第1课：爬虫基础爬虫准备参考资料精通python爬虫框架Scrapy,人民邮电出版社 Python3网路爬虫 Scrapy官方教程前提知识 url http协议 web前端,html,css,js ajax r ... python基础与爬虫笔记做java项目毕设前想通过爬虫爬取些数据存入数据库,从而在毕设中使用到,所以大致学习了一下python和爬虫做了下笔记. 大致过了一遍视频:https://www.bilibili.com/video ... 最新文章 PAT甲级1065 A+B and C (64bit)：[C++题解]爆long long，熟悉计算机存储有符号数原理 java机试数据结构_来看看阿里面试的一面都面了些什么笔试+机试（java岗） Oracle client 安装、配置 inur new.php id,Cmsez(随易)全站系统 0day Python机器学习：多项式回归与模型泛化005学习曲线最不靠谱CEO，一句话让公司损失上千亿元 leetcode刷题——415. 字符串相加 linux下部署tomcat的备忘 java url压缩_URL压缩算法的短地址 STP中各算法接口开销（COST）计算方式分区工具parted的详解及常用分区使用方法阶段1 语言基础+高级_1-3-Java语言高级_05-异常与多线程_第3节线程同步机制_2_线程安全问题的代码实现... centos7 mysql安装_CentOS7安装MySQL（详解）当在Windows上安装SQL Server，点击setup，出现以下错误0 x84b10001 iOS开发：苹果开发者账号第一次新建APP ID以及创建App的步骤 python输入一个数字n、计算1到n的和_怎么用python求1到n所有整数的和 J2EE的13个标准（规范） Git版本控制的使用火星辩证派第1期·精华版：为什么在DeFi这个赛道上国外项目异常火爆，中国市场却反应迟缓？... 
python全排列，递归热门文章 UnityShader——挺进体积光记在服务器部署selenium爬虫 Android布局——线性布局、相对布局、帧布局、表格布局、网格布局、约束布局 cesium编辑功能CesiumEdit.js 6713EDMA总结（一）【深度好文】Flink SQL流批⼀体化技术详解（一） LeetCode-SQL题目集永续合约_杠杆合约_合约交易基础知识用友U8案例教程固定资产后台配置晚上鸟沒事，白天没鸟事_鸟箱 Home Powered By WordPress 苏ICP备17004429号

零、爬虫基础（通用爬虫）

一、response响应参数

1、response六个参数

2、代码实现

二、urlretrieve下载文件

1、urlretrieve参数

2、代码实现

三、Get请求路由参数

1、单个参数

1、方法

2、代码实现

2、多个参数

1、方法

2、代码实现

四、Get AND Post

1、Get请求

2、Post请求

五、请求方式的定制

1、代码实现

六、Get方式获取Ajax

1、代码实现

七、简单封装

1、豆瓣、页面、封装

2、KFC、Json、封装

八、代理池和快代理

1、代理池

2、快代理

九、cookiejar—全书网

1、代码实现

壹、爬虫进阶（聚焦爬虫）

十、XPath：XML路径语言

1、简介

2、xpath语法

3、安装步骤

4、代码实现

十一、JsonPath

1. 介绍

2、操作符

3、资料

零、爬虫基础（通用爬虫）相关推荐

最新文章

热门文章