selenium爬取煎蛋网

直接上代码

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions  as ES
import requests
import urllib.request
import os
from lxml import etree
t = 0  # running counter appended to an author name that is already taken


class Custer(object):
    """Crawl http://jandan.net/ooxx with Selenium and save each page's images.

    ``run`` walks the listing page by page through the "Older Comments"
    link; ``xqy`` extracts author names and image URLs from one parsed
    page and downloads the images into the ``images/`` directory, using
    the author name as the file name.
    """

    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

    # XPath of the last pager link and of any image inside a comment body.
    _next_xpath = "//div[@class='cp-pagenavi']/a[last()]"
    _image_xpath = "//div[@class='text']//img"

    # File names (without extension) already handed out during this run.
    # Kept across pages so an author who appears on several pages does not
    # overwrite the file saved for an earlier page.
    _used_names = set()

    def __init__(self):
        # NOTE(review): ``executable_path`` is the Selenium 3 calling
        # convention this script was written for; newer Selenium releases
        # expect a Service object instead -- confirm the installed version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = "http://jandan.net/ooxx"

    def run(self):
        """Visit every page, hand each one to ``xqy``, then close the browser.

        A wait that times out propagates to the caller, as before. An error
        while locating or clicking the pager link is reported and ends the
        crawl; previously it was printed and the same page was retried
        forever.
        """
        self.driver.get(self.url)
        wait = WebDriverWait(self.driver, 10)
        try:
            while True:
                # Wait for the images BEFORE capturing the page source so
                # that the parsed snapshot actually contains them.
                wait.until(ES.presence_of_element_located((By.XPATH, self._image_xpath)))
                self.xqy(etree.HTML(self.driver.page_source))
                wait.until(ES.presence_of_element_located((By.XPATH, self._next_xpath)))
                try:
                    # find_element(By.XPATH, ...) replaces the removed
                    # find_element_by_xpath helper.
                    btn = self.driver.find_element(By.XPATH, self._next_xpath)
                    # get_attribute returns None when the attribute is absent.
                    if "Older Comments" not in (btn.get_attribute("title") or ""):
                        break
                    btn.click()
                    # The old link goes stale once the next page replaces
                    # the document; this avoids re-parsing the old page.
                    wait.until(ES.staleness_of(btn))
                except Exception as exc:
                    print("出现异常", exc)
                    break
        finally:
            # Always release the browser process, even on failure.
            self.driver.quit()

    @staticmethod
    def _absolute_url(src):
        """Return ``src`` with ``http:`` prepended when it is protocol-relative.

        Only ``//host/path`` URLs are changed. The previous substring test
        (``'http:' not in src``) also rewrote ``https://`` URLs into the
        invalid ``http:https://...``.
        """
        if src.startswith("//"):
            return "http:" + src
        return src

    @classmethod
    def _unique_names(cls, names):
        """Return ``names`` with repeated entries made unique.

        A name that was already used gets the global counter ``t`` appended
        (incremented until the result is unused), so no two images share a
        file name within one run.
        """
        global t
        unique = []
        for name in names:
            candidate = name
            while candidate in cls._used_names:
                t += 1
                candidate = name + str(t)
            cls._used_names.add(candidate)
            unique.append(candidate)
        return unique

    def xqy(self, html):
        """Download the images found in one parsed page.

        Args:
            html: lxml element tree of the page (result of ``etree.HTML``).

        Authors and image URLs are paired by position; the pairing stops at
        the shorter of the two lists. A failed download is reported and
        skipped so one bad URL does not abort the whole crawl.
        """
        if html is None:
            return
        # Both expressions are absolute, so they are evaluated over the
        # whole document; no intermediate element list is needed (indexing
        # one raised IndexError on a page without matching rows).
        authors = html.xpath("//div[@class='author']/strong/text()")
        sources = html.xpath("//div[@class='text']//img//@src")
        count = min(len(authors), len(sources))
        if not count:
            return
        names = self._unique_names(authors[:count])
        os.makedirs("images", exist_ok=True)  # urlretrieve does not create it
        for name, src in zip(names, sources):
            address = self._absolute_url(src)
            suffix = os.path.splitext(address)[1]  # extension, e.g. .jpg/.png
            try:
                urllib.request.urlretrieve(address, 'images/' + name + suffix)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError derive from OSError; ValueError covers
                # a src that is not a usable URL.
                print("出现异常", address, exc)


def main():
    """Create the crawler and run it to completion."""
    rea = Custer()
    rea.run()


if __name__ == '__main__':
    main()

爬取的图片

进阶

我在这里尝试加入了多线程，但不确定是否真正实现了多线程爬取；不过感觉爬取速度快了很多。

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions  as ES
import requests
import threading
import urllib.request
import os
from lxml import etree
t = 0  # running counter appended to an author name that is already taken
gCondition = threading.Condition()  # serializes every use of the shared browser


class Custer(threading.Thread):
    """Worker thread that crawls http://jandan.net/ooxx and saves its images.

    All workers share ONE browser. Under ``gCondition`` a worker takes a
    snapshot of the current page and moves the browser on to the next page;
    it then downloads that page's images outside the lock, so downloads of
    different pages overlap while the browser is only ever driven by one
    thread at a time.
    """

    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    # Shared browser. Created by the first worker that needs it instead of
    # at class-definition time, so importing this module no longer launches
    # Chrome.
    driver = None
    url = "http://jandan.net/ooxx"
    # Set once the last page has been handed out (or the browser failed).
    finished = False

    # XPath of the last pager link and of any image inside a comment body.
    _next_xpath = "//div[@class='cp-pagenavi']/a[last()]"
    _image_xpath = "//div[@class='text']//img"

    # File names (without extension) already handed out during this run,
    # and the lock guarding them together with the global counter ``t``.
    _used_names = set()
    _names_lock = threading.Lock()

    def run(self):
        """Repeatedly claim the next page and download its images."""
        while True:
            # ``with`` releases the lock on every path. The previous manual
            # acquire() was only released when the "Older Comments" link
            # was found, so the last page or any error left it held and
            # blocked the other workers forever.
            with gCondition:
                html = self._next_page()
            if html is None:
                return
            self.xqy(html)

    @classmethod
    def _next_page(cls):
        """Return the parsed current page and advance the shared browser.

        Must be called with ``gCondition`` held. Returns ``None`` when no
        page is left. Any error is reported and ends the crawl for all
        workers; a page parsed before the error is still returned.
        """
        if cls.finished:
            return None
        html = None
        try:
            if cls.driver is None:
                # NOTE(review): ``executable_path`` is the Selenium 3
                # calling convention this script was written for; newer
                # Selenium releases expect a Service object -- confirm the
                # installed version.
                cls.driver = webdriver.Chrome(executable_path=cls.driver_path)
                cls.driver.get(cls.url)
            wait = WebDriverWait(cls.driver, 10)
            # Wait for the images BEFORE capturing the page source so that
            # the parsed snapshot actually contains them.
            wait.until(ES.presence_of_element_located((By.XPATH, cls._image_xpath)))
            html = etree.HTML(cls.driver.page_source)
            wait.until(ES.presence_of_element_located((By.XPATH, cls._next_xpath)))
            # find_element(By.XPATH, ...) replaces the removed
            # find_element_by_xpath helper.
            btn = cls.driver.find_element(By.XPATH, cls._next_xpath)
            # get_attribute returns None when the attribute is absent.
            if "Older Comments" in (btn.get_attribute("title") or ""):
                btn.click()
                # The old link goes stale once the next page replaces the
                # document, so the next worker never re-reads this page.
                wait.until(ES.staleness_of(btn))
            else:
                cls._shutdown()
        except Exception as exc:
            print("出现异常", exc)
            cls._shutdown()
        return html

    @classmethod
    def _shutdown(cls):
        """Mark the crawl as finished and close the shared browser."""
        cls.finished = True
        if cls.driver is not None:
            cls.driver.quit()

    @staticmethod
    def _absolute_url(src):
        """Return ``src`` with ``http:`` prepended when it is protocol-relative.

        Only ``//host/path`` URLs are changed. The previous substring test
        (``'http:' not in src``) also rewrote ``https://`` URLs into the
        invalid ``http:https://...``.
        """
        if src.startswith("//"):
            return "http:" + src
        return src

    @classmethod
    def _unique_names(cls, names):
        """Return ``names`` with repeated entries made unique.

        A name that was already used gets the global counter ``t`` appended
        (incremented until the result is unused). The whole step runs under
        ``_names_lock`` because several workers name files concurrently.
        """
        global t
        unique = []
        with cls._names_lock:
            for name in names:
                candidate = name
                while candidate in cls._used_names:
                    t += 1
                    candidate = name + str(t)
                cls._used_names.add(candidate)
                unique.append(candidate)
        return unique

    def xqy(self, html):
        """Download the images found in one parsed page.

        Args:
            html: lxml element tree of the page (result of ``etree.HTML``).

        Authors and image URLs are paired by position; the pairing stops at
        the shorter of the two lists. A failed download is reported and
        skipped so one bad URL does not kill the worker.
        """
        if html is None:
            return
        # Both expressions are absolute, so they are evaluated over the
        # whole document; no intermediate element list is needed (indexing
        # one raised IndexError on a page without matching rows).
        authors = html.xpath("//div[@class='author']/strong/text()")
        sources = html.xpath("//div[@class='text']//img//@src")
        count = min(len(authors), len(sources))
        if not count:
            return
        names = self._unique_names(authors[:count])
        os.makedirs("images", exist_ok=True)  # urlretrieve does not create it
        for name, src in zip(names, sources):
            address = self._absolute_url(src)
            suffix = os.path.splitext(address)[1]  # extension, e.g. .jpg/.png
            try:
                urllib.request.urlretrieve(address, 'images/' + name + suffix)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError derive from OSError; ValueError covers
                # a src that is not a usable URL.
                print("出现异常", address, exc)


def main():
    """Start nine worker threads and wait for all of them to finish."""
    workers = [Custer() for _ in range(9)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()

转载于:https://www.cnblogs.com/c-pyday/p/10609812.html

selenium爬取煎蛋网相关推荐

利用Selenium爬取煎蛋网妹纸图原来是这么简单！！！
上期说到,下期更新爬取煎蛋网妹纸的selenium版本,它来了!!! 获取页面url信息获取图片url信息永久性保存图片注释: 想要获取GeckoDriver安装包的小伙伴,可以给博主留言或者搜 ...
python爬取煎蛋网妹子图
看了小甲鱼的python教学视频,爬取煎蛋网妹子图的代码已经不适用了,网上其它人的代码多有报错,本萌新结合其他人的方法,终于写出一个可行的,在此分享_(:з)∠)_(运行非常慢就是了) import ...
python 爬虫爬取煎蛋网妹子图，我1T的硬盘装满了！
前言大家好,这里是「brucepk」爬虫系列教程.此系列教程以实例项目为材料进行分析,从项目中学习 python 爬虫,跟着我一起学习,每天进步一点点. 煎蛋网站 image 很多朋友都反映学 p ...
爬取煎蛋网中的妹子图
看了小甲鱼爬取煎蛋网的视频,但是当时无论怎么写都写不出来,后来又看了一点其他视频,心血来潮把那段代码重写了一次,总算成功了,也算是解决一个心病吧,唯一美中不足的是还是不太会使用find函数,慢慢来吧, ...
python爬虫图片实例-Python爬虫爬取煎蛋网图片代码实例
这篇文章主要介绍了Python爬虫爬取煎蛋网图片代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下今天,试着爬取了煎蛋网的图片. 用到的包: ...
python爬图代码实例_Python爬虫爬取煎蛋网图片代码实例
这篇文章主要介绍了Python爬虫爬取煎蛋网图片代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下今天,试着爬取了煎蛋网的图片. 用到的包: ...
Python Scrapy 爬取煎蛋网妹子图实例（一）
前面介绍了爬虫框架的一个实例,那个比较简单,这里在介绍一个实例爬取煎蛋网妹子图,遗憾的是上周煎蛋网还有妹子图了,但是这周妹子图变成了随手拍, 不过没关系,我们爬图的目的是为了加强实战应用,管 ...
Python爬虫爬取煎蛋网图片代码实例
这篇文章主要介绍了Python爬虫爬取煎蛋网图片代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下今天,试着爬取了煎蛋网的图片. 用到的包: ...
python3网络爬虫：爬取煎蛋网美女照片
1.1 前言今天开学不久,课也不多,就想着来做个爬虫,看着好多老司机喜欢看美女图片,想做个爬取煎蛋网的小爬虫.哈哈,下面开车了,各位,上车记得滴卡参考: http://blog.csdn.net/ ...

selenium爬取煎蛋网

selenium爬取煎蛋网

进阶

selenium爬取煎蛋网相关推荐

最新文章

热门文章