python爬虫多线程多进程示例

# -*- coding: utf-8 -*-
'''美图录 (Meitulu) image-gallery crawler.'''
#-------------------------------------------------
import re
import os,stat
import time
import datetime
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from threading import Thread
from multiprocessing import Process#-------------------------------------------------
class Web_IMG_Crawler(object):
    """Stateless helper that fetches gallery pages and saves their images.

    Every method works only on its arguments, so one instance can be shared
    between worker threads.
    """

    # Seconds to wait for an image server before giving up; without a timeout
    # a stalled connection would block its worker thread forever.
    REQUEST_TIMEOUT = 30

    @staticmethod
    def _item_id(url):
        """Return the ``<id>`` part of an ``.../item/<id>.html`` URL, or None."""
        match = re.search(r"item/(.+?)\.html", url)
        return match.group(1) if match else None

    def ant_Crawler(self, url):
        """Build request headers that get past simple anti-crawler checks.

        Args:
            url: Any URL of the target site.

        Returns:
            A dict with a fixed ``User-Agent`` and a ``Referer`` of the form
            ``http://<host>``. ``Referer`` is left empty when no host can be
            found in ``url``.
        """
        header = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
            'Referer': '',
        }
        # Host is whatever follows "//" up to the next "/" (or end of string).
        host = re.search(r"//([^/]+)", url)
        if host:
            # The scheme is always "http://", as in the original implementation.
            header['Referer'] = 'http://' + host.group(1)
        return header

    def JIEXI_WEB(self, url, de_code):
        """Download ``url`` and parse it into a BeautifulSoup tree.

        Args:
            url: Page address to open.
            de_code: Text encoding of the page (visible in the browser's
                page-source / page-info view), e.g. ``'utf-8'``.

        Returns:
            A ``BeautifulSoup`` object built with the ``lxml`` parser.
        """
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as open_url:
            html = open_url.read().decode(de_code)
        return BeautifulSoup(html, features="lxml")

    def QU_CHONG(self, all_url_list):
        """Return the items of ``all_url_list`` without duplicates, order kept."""
        try:
            # dict keys keep insertion order, giving a linear-time de-duplication.
            return list(dict.fromkeys(all_url_list))
        except TypeError:
            # Unhashable items: fall back to the equality-based scan.
            end_list = []
            for element in all_url_list:
                if element not in end_list:
                    end_list.append(element)
            return end_list

    def SAVE_IMG(self, file_path, url_img, file_name):
        """Download ``url_img`` into ``file_path`` using ``file_name`` as stem.

        The file suffix is taken from ``url_img``. Failures are reported on
        stdout and never raised.
        """
        try:
            # Create the target directory if it is missing.
            os.makedirs(file_path, exist_ok=True)
            file_suffix = os.path.splitext(url_img)[1]
            print(file_suffix)
            # Full target name, including the directory.
            file_name = '{}{}{}{}'.format(file_path, os.sep, file_name, file_suffix)
            print(file_name)
            urllib.request.urlretrieve(url_img, filename=file_name)
        except IOError as e:
            print("IOError: {}".format(e))
        except Exception as e:
            print("Exception: {}".format(e))

    def SAVE_IMAGE(self, IMG_link_set, url, file_path, file_name, sid):
        """Download every image in ``IMG_link_set`` and save it as a JPEG.

        Args:
            IMG_link_set: Image URLs of one gallery page.
            url: Gallery URL, used to derive the ``Referer`` header.
            file_path: Directory the images are written to (created if missing).
            file_name: Common file-name prefix; ``None`` is treated as ``''``.
            sid: Page index, embedded in each file name.

        A failing image is reported on stdout and skipped, so one bad link
        does not abort the rest of the page.
        """
        headers = self.ant_Crawler(url)
        base_name = file_name or ''
        try:
            if file_path:
                os.makedirs(file_path, exist_ok=True)
        except OSError as err:
            print("IOError: {}".format(err))
            return
        for number, url_img in enumerate(IMG_link_set, start=1):
            PAGE = str(sid) + '页' + str(number) + '张'
            file_name0 = base_name + PAGE + '.jpg'
            try:
                html_IMG = requests.get(
                    url_img, headers=headers, timeout=self.REQUEST_TIMEOUT)
                # Turn HTTP error pages into an exception instead of trying
                # to decode them as an image.
                html_IMG.raise_for_status()
                image = Image.open(BytesIO(html_IMG.content))
                image.save(os.path.join(file_path, file_name0))
                print('第%s图片下载完成,图片名称：%s' % (PAGE, file_name0))
            except (requests.RequestException, OSError) as err:
                print("IOError: {}".format(err))

    def Nextall_page(self, soup, url):
        """Return the URLs of all pages of the gallery that starts at ``url``.

        Args:
            soup: Parsed first page of the gallery.
            url: Address of the first page, ``.../item/<id>.html``.

        Returns:
            ``[url, <base>_2.html, ..., <base>_N.html]`` where ``N`` is the
            highest page number linked from ``soup``. Only ``[url]`` is
            returned when the URL has no item id or the page has no
            pagination links.
        """
        url_list0 = [url]
        item_id = self._item_id(url)
        if item_id is None:
            return url_list0
        escaped_id = re.escape(item_id)
        page_pattern = re.compile(escaped_id + r"_(\d+)\.html")
        last_page = 1
        for link in soup.find_all('a', href=re.compile(escaped_id)):
            match = page_pattern.search(link.get('href') or '')
            if match:
                last_page = max(last_page, int(match.group(1)))
        # Everything in front of ".html" is the stem the page suffix goes on.
        urlf = url.split('.html', 1)[0]
        for page in range(2, last_page + 1):
            url_list0.append(urlf + '_' + str(page) + '.html')
        return url_list0

    def HQ_IMG_SQL(self, soup, url, file_path, file_name, sid):
        """Collect the image URLs of one gallery page and save the images.

        Images are recognised by a ``src`` attribute that contains the item id
        taken from ``url``. Nothing is downloaded when ``url`` has no item id.
        """
        item_id = self._item_id(url)
        if item_id is None:
            print("Exception: no item id in {}".format(url))
            return
        src_pattern = re.compile(re.escape(item_id))
        url_IMG_list = [tag.get("src") for tag in soup.find_all(src=src_pattern)]
        IMG_link_set = self.QU_CHONG(url_IMG_list)
        self.SAVE_IMAGE(IMG_link_set, url, file_path, file_name, sid)

    def HQ_IMG_SQL2(self, soup):
        """Return the distinct ``originalsrc`` values that contain ``jpg``."""
        url_IMG_list2 = [
            tag.get("originalsrc")
            for tag in soup.find_all(originalsrc=re.compile("jpg"))
        ]
        return self.QU_CHONG(url_IMG_list2)

    def NAME_IMG(self, soup):
        """Return the ``title`` of a tag whose ``originalsrc`` contains ``jpg``.

        Returns ``None`` when no tag matches.
        """
        # NOTE(review): the collapsed source does not show whether the original
        # returned the first or the last match; the last one is returned here.
        name_img0 = None
        for tag in soup.find_all(originalsrc=re.compile("jpg")):
            name_img0 = tag.get("title")
        return name_img0

    def NAME_IMG2(self, soup):
        """Return the ``alt`` text of a tag whose ``src`` contains ``jpg``.

        Returns ``None`` when no tag matches.
        """
        # NOTE(review): first-vs-last match is not recoverable from the
        # collapsed source; the last one is returned here.
        name_img2 = None
        for tag in soup.find_all(src=re.compile("jpg")):
            name_img2 = tag.get("alt")
        return name_img2


def searchurl():
    """Ask for an index page and return the distinct gallery links on it.

    A link counts as a gallery link when its ``href`` contains ``item``.
    """
    search_url = input("请输入目标网址：",).strip()
    de_code = r'utf-8'
    crawler = Web_IMG_Crawler()
    soup = crawler.JIEXI_WEB(search_url, de_code)
    searchurl_list = [
        link.get('href')
        for link in soup.find_all('a', href=re.compile('item'))
    ]
    return crawler.QU_CHONG(searchurl_list)


def TOopen(url, file_path='F:/图片/Saved Pictures/', de_code=r'utf-8'):
    """Download one gallery, fetching each of its pages in its own thread.

    Args:
        url: Address of the gallery's first page.
        file_path: Directory the images are saved to.
        de_code: Text encoding of the gallery pages.
    """
    crawler = Web_IMG_Crawler()
    soup = crawler.JIEXI_WEB(url, de_code)
    file_name = crawler.NAME_IMG2(soup)
    NEXTurl_list = crawler.Nextall_page(soup, url)

    def thread_Open(sid, page_url):
        """Fetch one page and save its images; report errors, never raise."""
        try:
            page_soup = crawler.JIEXI_WEB(page_url, de_code)
            crawler.HQ_IMG_SQL(page_soup, url, file_path, file_name, sid)
        except Exception as err:
            # Boundary of a worker thread: report and let the other pages go on.
            print("Exception: {} ({})".format(err, page_url))

    threads = [
        Thread(target=thread_Open, args=(sid, page_url))
        for sid, page_url in enumerate(NEXTurl_list)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


class My_searchThread(Process):
    """Process that downloads the gallery at ``url`` by running ``TOopen``."""

    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        TOopen(self.url)
class My_searchThread0(Thread):
    """Thread that downloads the gallery at ``url`` by running ``TOopen``.

    Args:
        url: Address of the gallery's first page; stored on ``self.url``.
    """

    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        """Thread entry point: crawl the gallery this thread was created for."""
        TOopen(self.url)
#-------------------------------------------------
def main():
    """Crawl every gallery found by ``searchurl`` and print timing figures.

    One ``My_searchThread0`` is started per gallery URL; the function waits
    for all of them and then prints the start time, end time and elapsed
    seconds.
    """
    begintime = time.strftime("%Y-%m-%d %H:%M:%S")
    starttime = datetime.datetime.now()

    threadssearch = []
    for url in searchurl():
        y = My_searchThread0(url)
        threadssearch.append(y)
        y.start()
    for y in threadssearch:
        y.join()

    endtime = time.strftime("%Y-%m-%d %H:%M:%S")
    finishtime = datetime.datetime.now()
    # total_seconds() keeps days and the fractional part; the ``seconds``
    # attribute alone would drop both.
    runtime = (finishtime - starttime).total_seconds()
    timespan = '%f' % runtime

    print('Time Begin:' + begintime)
    print('Time End:' + endtime)
    print('Time Span:' + timespan + ' s')


if __name__ == "__main__":
    main()

python爬虫多线程多进程示例相关推荐

python代理ip多进程_静听网+python爬虫+多线程+多进程+构建IP代理池
目标网站:静听网网站url:http://www.audio699.com/ 目标文件:所有在线听的音频文件附:我有个喜好就是晚上睡觉听有声书,然而很多软件都是付费才能听,免费在线网站虽然能听,但 ...
静听网+python爬虫+多线程+多进程+构建IP代理池
目标网站:静听网网站url:http://www.audio699.com/ 目标文件:所有在线听的音频文件附:我有个喜好就是晚上睡觉听有声书,然而很多软件都是付费才能听,免费在线网站虽然能听,但 ...
创新实训-python爬虫多线程|解决中文乱码问题|卡片向上浮动效果|图文切换
创新实训-python爬虫多线程|乱码问题|前端样式重新修改考完毛概,把上周的工作总结一下.爬虫在第一周的时候只爬了一个就业指导这一个模块,这一次又加了招聘服务模块,所以就用了两个线程.前端首页一开 ...
python爬虫多线程爬取网站排行榜上的所有小说
python爬虫,多线程批量爬取多部小说欢迎阅读环境获取排行榜top上的所有小说地址分析排行榜top100页面的html,获取所有小说的url 遍历小说列表url到生成txt 多线程启动爬取任 ...
python爬虫多线程下载_Python爬虫之多线程下载豆瓣Top250电影图片
爬虫项目介绍本次爬虫项目将爬取豆瓣Top250电影的图片,其网址为:https://movie.douban.com/top250, 具体页面如下图所示: 本次爬虫项目将分别不使用多线程和使用多线程 ...
python爬虫多线程是什么意思_python爬虫中多线程的使用详解
queue介绍 queue是python的标准库,俗称队列.可以直接import引用,在python2.x中,模块名为Queue.python3直接queue即可在python中,多个线程之间的数据 ...
Python爬虫多线程提升数据下载的性能优化
Python爬虫多线程提升数据下载的性能优化很幸运地上了两次Tony老师关于python爬虫的课(收获巨多),在这里我对第一次课做一下知识总结: 1.什么是爬虫? 自动从网络上进行数据采集的程序一 ...
python爬虫多线程是什么意思_python爬虫14 | 就这么说吧，如果你不懂多线程和线程池，那就去河边摸鱼！...
你知道吗? 在我的心里你是多么的重要就像恩请允许我来一段 freestyle 你们准备好了妹油你看这个碗它又大又圆就像这条面它又长又宽你们在这里看文章觉得很开心就像我 ...
Python中多线程多进程与协程的区别
进程:一个运行的程序(代码)就是一个进程,没有运行的代码叫程序,进程是系统资源分配的最小单位,进程拥有自己独立的内存空间,所以进程间数据不共享,开销大. 线程: 调度执行的最小单位,也叫执行路径,不能 ...

python爬虫多线程多进程示例

python爬虫多线程多进程示例相关推荐

最新文章

热门文章