开启线程池示例

# Thread-pool demo: submit 50 dummy spider jobs and block until all finish.
import time
import threading
from concurrent.futures import ThreadPoolExecutor

# Pool of up to 100 worker threads; futures from submit() are collected
# so we can wait on every job below.
pool = ThreadPoolExecutor(100)
spider_list = []


def func1(page):
    """Dummy spider job: just prints the page number it was given."""
    print("a", page)


pages = 50  # number of list pages (or detail URLs) to submit
for page in range(pages):
    # Thread id of the *submitting* thread (always the main thread here).
    print('running thread id : %d   now=%d' % (threading.get_ident(), page))
    str_page = pool.submit(func1, page)
    spider_list.append(str_page)
print("spider_list=", spider_list)

# result() blocks until the corresponding job completed.
# (Loop variable renamed: the original used `list`, shadowing the builtin.)
for future in spider_list:
    future.result()
pool.shutdown()  # release worker threads; the original never shut the pool down
print('线程全部执行完毕')

一、多线程爬取京东投诉信息

#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""Multi-threaded scraper for JD complaints on tousu.sina.com.cn.

Each list page is fetched as one thread-pool job (dewu_company); every
complaint on the page is then fetched and parsed by parse_detail, which
appends one JSON line per complaint to 京东投诉信息.json.
"""
import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings

warnings.filterwarnings("ignore")

session = HTMLSession()
proxies = None
# Thread pool shared by all list-page jobs.
pool = ThreadPoolExecutor(30)
big_list = []   # all parsed complaint dicts (appended from worker threads)
pool_list = []  # futures returned by pool.submit


def dewu_company(pages):
    """Fetch one list page of complaints and parse every complaint on it.

    pages: 0-based page index forwarded as the API's ``page`` parameter.
    """
    print("第" + str(pages) + "页")
    t = str(int(time.time() * 1000))  # cache-busting timestamp in ms
    url = "https://tousu.sina.com.cn/api/company/received_complaints"
    headers = {
        "authority": "tousu.sina.com.cn",
        "method": "GET",
        "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
        "scheme": "https",
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }
    params = {
        "couid": "5650743478",  # 1878960481 阿里 / 5650743478 京东 / 7046706808 得物
        "type": "1",
        "page_size": "10",
        "page": pages,
        "_": t,
    }
    res = session.get(url, params=params, headers=headers, proxies=proxies,
                      verify=False, timeout=5)
    info_list = res.json()["result"]["data"]["complaints"]
    # BUG FIX: the original did ``return parse_detail(info_url, info)``
    # inside this loop, so only the FIRST complaint of each page was ever
    # parsed and the remaining ~9 were silently skipped.
    for info in info_list:
        info_url = 'https:' + info['main']['url']
        parse_detail(info_url, info)


def parse_detail(info_url, info):
    """Fetch one complaint detail page, parse its fields, persist one JSON line.

    Side effects: appends the parsed dict to the shared ``big_list``,
    stores it under ``info['投诉详情']``, and appends one JSON line to
    京东投诉信息.json. Errors are printed and swallowed so a single bad
    page cannot kill a worker thread.
    """
    try:
        res = session.get(info_url, proxies=proxies, verify=False)
        new_dict = dict()
        new_dict['投诉编号'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[1]/text()')[0]
        new_dict['投诉对象'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[2]/a//text()')[0]
        new_dict['投诉问题'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[3]/text()')[0]
        new_dict['投诉要求'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[4]/text()')[0]
        new_dict['涉诉金额'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[5]/text()')[0]
        new_dict['投诉进度'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[6]/b/text()')[0]
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text

        # Complaint images (protocol-relative hrefs → absolute URLs).
        img_info_list = []
        for img_href in res.html.xpath('//*[@class="example-image-link"]/@href'):
            img_info_list.append("https:" + img_href)
        new_dict['投诉图片'] = img_info_list

        # Complaint videos: resolve each video id via the ivideo play API.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        new_vide_list = []
        for vide_id in vide_id_list:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            vide_res = session.get(vide_info_url, verify=False)
            new_vide_list.append(json.loads(vide_res.text))
        if new_vide_list:
            new_dict['投诉视频详情'] = new_vide_list

        info['投诉详情'] = new_dict
        big_list.append(new_dict)
        print("big_list==", big_list)
        # BUG FIX: the original dumped the whole accumulated ``big_list``
        # on every call, duplicating all earlier records in the file (the
        # author's TODO comment complained about exactly this). Write one
        # JSON record per line instead (JSON-lines format).
        with open('京东投诉信息.json', "a+", encoding='utf-8') as fw:
            fw.write(json.dumps(new_dict, ensure_ascii=False) + '\n')
    except Exception as e:
        # Best-effort scraping: log and continue with the next complaint.
        print(e)


def main(pages):
    """Submit ``pages`` list-page jobs to the pool and wait for them all."""
    startTime = time.time()
    for page in range(pages):
        name = pool.submit(dewu_company, page)
        pool_list.append(name)
    for n in pool_list:
        n.result()  # propagate worker exceptions / wait for completion
    print("全部结束并保存本地")
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))


if __name__ == '__main__':
    # 爬取页数 (number of list pages to scrape)
    main(20)

20页数据爬取时间:Done, Time cost: 1.6854908466339111

二、多线程爬取阿里详情投诉信息

#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""Multi-threaded scraper for Alibaba complaints on tousu.sina.com.cn.

dewu_company walks list pages 1-19 sequentially; each complaint's detail
page is fetched in its own thread (parse_detail), joined in batches so at
most ~10 detail threads run at once. Each parsed complaint is appended as
one JSON line to 阿里投诉信息.json.
"""
import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings

warnings.filterwarnings("ignore")

session = HTMLSession()
proxies = None


def dewu_company():
    """Scrape list pages 1-19, spawning one detail thread per complaint."""
    for page in range(1, 20):
        print("第" + str(page) + "页")
        t = str(int(time.time() * 1000))  # cache-busting timestamp in ms
        url = "https://tousu.sina.com.cn/api/company/received_complaints"
        headers = {
            "authority": "tousu.sina.com.cn",
            "method": "GET",
            "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
            "scheme": "https",
            "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        params = {
            "couid": "1878960481",  # 1878960481 阿里 / 5650743478 京东 / 7046706808 得物
            "type": "1",
            "page_size": "10",
            "page": page,
            "_": t,
        }
        res = session.get(url, params=params, headers=headers, proxies=proxies,
                          verify=False, timeout=5)
        info_list = res.json()["result"]["data"]["complaints"]

        # One detail thread per complaint, joined in batches of ~10 so we
        # never pile up unbounded threads.
        ths = []
        for info in info_list:
            info_url = 'https:' + info['main']['url']
            th = threading.Thread(target=parse_detail, args=(info_url, info))
            th.start()
            ths.append(th)
            if len(ths) > 10:
                for th_one in ths:
                    th_one.join()
                ths = []
        for th_one in ths:
            th_one.join()


def parse_detail(info_url, info):
    """Fetch one complaint detail page, parse its fields, persist one JSON line.

    Runs in a worker thread. On success it stores the parsed dict under
    ``info['投诉详情']`` and appends one JSON line to 阿里投诉信息.json;
    on failure it prints the error and writes nothing.
    """
    try:
        res = session.get(info_url, proxies=proxies, verify=False)
        new_dict = dict()
        new_dict['投诉编号'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[1]/text()')[0]
        new_dict['投诉对象'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[2]/a//text()')[0]
        new_dict['投诉问题'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[3]/text()')[0]
        new_dict['投诉要求'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[4]/text()')[0]
        new_dict['涉诉金额'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[5]/text()')[0]
        new_dict['投诉进度'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[6]/b/text()')[0]
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text

        # Complaint images (protocol-relative hrefs → absolute URLs).
        img_info_list = []
        for img_href in res.html.xpath('//*[@class="example-image-link"]/@href'):
            img_info_list.append("https:" + img_href)
        new_dict['投诉图片'] = img_info_list

        # Complaint videos: resolve each video id via the ivideo play API.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        new_vide_list = []
        for vide_id in vide_id_list:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            vide_res = session.get(vide_info_url, verify=False)
            new_vide_list.append(json.loads(vide_res.text))
        if new_vide_list:
            new_dict['投诉视频详情'] = new_vide_list

        # BUG FIX: in the original the record was only stored inside the
        # video branch, so complaints WITHOUT videos could be dropped, and
        # the file write ran even after an exception, emitting empty "[]"
        # lines. Save every successfully parsed record, one JSON object
        # per line, and only on success.
        info['投诉详情'] = new_dict
        print("record==", new_dict)
        with open('阿里投诉信息.json', "a+", encoding='utf-8') as fw:
            fw.write(json.dumps(new_dict, ensure_ascii=False) + '\n')
    except Exception as e:
        # Best-effort scraping: log and let the thread finish quietly.
        print(e)


if __name__ == '__main__':
    startTime = time.time()
    dewu_company()
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))

20页数据爬取时间:Done, Time cost: 20.348562240600586

python3多线程爬取京东投诉信息相关推荐

  1. go爬虫和python爬虫哪个好_python 爬虫实战项目--爬取京东商品信息(价格、优惠、排名、好评率等)-Go语言中文社区...

    利用splash爬取京东商品信息 一.环境 window7 python3.5 pycharm scrapy scrapy-splash MySQL 二.简介 为了体验scrapy-splash 的动 ...

  2. python爬取京东商品信息代码_爬取京东商品信息

    利用 BeautifulSoup + Requests 爬取京东商品信息并保存在Excel中 一.查看网页信息 打开京东商城,随便输入个商品,就选固态硬盘吧 先看看 URL 的规律,可以看到我们输入的 ...

  3. layui获取input信息_python爬虫—用selenium爬取京东商品信息

    python爬虫--用selenium爬取京东商品信息 1.先附上效果图(我偷懒只爬了4页) 2.京东的网址https://www.jd.com/ 3.我这里是不加载图片,加快爬取速度,也可以用Hea ...

  4. python爬取京东商品属性_python爬虫小项目:爬取京东商品信息

    #爬取京东手机信息 import requests from bs4 import BeautifulSoup from selenium import webdriver import re imp ...

  5. Day06,selenium的剩余用法、万能登录破解和爬取京东商品信息,及破解极验滑动验证码...

    一.自动登录抽屉新热榜 from selenium import webdriver import timedriver = webdriver.Chrome(r'D:\BaiduNetdiskDow ...

  6. python爬虫爬取京东商品评价_python爬取京东商品信息及评论

    ''' 爬取京东商品信息: 功能: 通过chromeDrive进行模拟访问需要爬取的京东商品详情页(https://item.jd.com/100003196609.html)并且程序支持多个页面爬取 ...

  7. Python爬取京东书籍信息(包括书籍评论数、简介等)

    Python爬取京东书籍信息(包括书籍评论数.简介等) 一. 工具 二. 准备爬取的数据 三. 内容简述 四. 页面分析 (一) 寻找目录中商品所在标签 (二) 寻找页面中能爬取到的所有数据 (三) ...

  8. python爬虫爬商品库存_python爬虫实践——爬取京东商品信息

    1 ''' 2 爬取京东商品信息:3 请求url:4 https://www.jd.com/5 提取商品信息:6 1.商品详情页7 2.商品名称8 3.商品价格9 4.评价人数10 5.商品商家11 ...

  9. Selenium介绍--实例爬取京东商品信息与图片

    目录 一.Selenium简介 二.Selenium组成 三.Selenium特点 四.Selenium的基本使用 1.下载所用浏览器需要的驱动 2.创建项目并导入依赖 3.入门 3.代码演示 五.实 ...

  10. 爬虫实战:使用Selenium爬取京东宝贝信息

    有些页面数据是采用Ajax获取的,但是这些Ajax接口参数比较复杂,可能会加入加密秘钥.对于这种页面,最方便的方法是通过selenium.可以用Selenium来模拟浏览器操作,抓取京东商品信息. 网 ...

最新文章

  1. 不属于JAVA类中的变量_在Java中,不属于整数类型变量的是( )。_学小易找答案...
  2. linux线程堆分配,如何在Linux中的相同进程下为线程分配堆栈或内存
  3. linux mysql 端口配置文件_Linux配置测试环境,部署项目(指定端口,数据库连接)...
  4. Nemo for transfer learning
  5. SVN 分支/合并/切换
  6. NodeJS+Express+MongoDB - 张果 - 博客园
  7. P3308-[SDOI2014]LIS【最小割】
  8. Apache手工编译安装(内附软件包)
  9. security安全表达式介绍
  10. windows下用C/C++访问MySQL数据库
  11. 大数据之-Hadoop完全分布式_SCP案例_同时在1000台服务器上安装JDK_配置环境变量---大数据之hadoop工作笔记0031
  12. [摘抄]游戏主循环逻辑
  13. html + CSS
  14. 最新达内大数据视频教程
  15. 计算机主板型号尺寸,10大华硕主板型号简介,组装电脑的朋友可别错过
  16. 数字鉴相器matlab,一种数字鉴相器的设计.pdf
  17. 【Tableau Desktop 企业日常问题20】Tableau怎么折线变虚线?
  18. 小觅摄像头 VINS-MONO安装
  19. 360极速浏览器屏蔽百度广告
  20. 【零基础深度学习教程第一课:深度学习基础】

热门文章

  1. HDU 5857 Median(水~)
  2. Failure obtaining db row lock: No row exists in table QRTZ_LOCKS for lock named
  3. 百融大数据自助查询_【百融云创科技股份有限公司现在大数据公司被查,百融受影响了吗】-看准网...
  4. 重庆大学计算机学院王臣,重庆大学UMD研究生网络文化工作室学生干部述职大会暨换届选举大会圆满落幕...
  5. Zigbee之旅(十):综合小实验——基于CC2430的温度监测系统(转)
  6. linux指定网卡走流量,Linux实时输出指定网卡流量
  7. ThinkPad Win7更改鼠标滚轮设置问题
  8. 移动开发技术的进化历程(原生开发与跨平台技术)
  9. 写在博士旅程之前|博士第一年|博士第三年|博士第四年
  10. java队列和栈 共同_java 栈和队列的模拟--java