开启线程池示例

# Thread-pool demo: submit 50 dummy spider jobs and block until all finish.
import time
import threading
from concurrent.futures import ThreadPoolExecutor

# Pool of up to 100 worker threads; futures from submit() are collected
# so we can wait on every job below.
pool = ThreadPoolExecutor(100)
spider_list = []


def func1(page):
    """Dummy spider job: just prints the page number it was given."""
    print("a", page)


pages = 50  # number of list pages (or detail URLs) to submit
for page in range(pages):
    # Thread id of the *submitting* thread (always the main thread here).
    print('running thread id : %d   now=%d' % (threading.get_ident(), page))
    str_page = pool.submit(func1, page)
    spider_list.append(str_page)
print("spider_list=", spider_list)

# result() blocks until the corresponding job completed.
# (Loop variable renamed: the original used `list`, shadowing the builtin.)
for future in spider_list:
    future.result()
pool.shutdown()  # release worker threads; the original never shut the pool down
print('线程全部执行完毕')

一、多线程爬取京东投诉信息

#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""Multi-threaded scraper for JD complaints on tousu.sina.com.cn.

Each list page is fetched as one thread-pool job (dewu_company); every
complaint on the page is then fetched and parsed by parse_detail, which
appends one JSON line per complaint to 京东投诉信息.json.
"""
import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings

warnings.filterwarnings("ignore")

session = HTMLSession()
proxies = None
# Thread pool shared by all list-page jobs.
pool = ThreadPoolExecutor(30)
big_list = []   # all parsed complaint dicts (appended from worker threads)
pool_list = []  # futures returned by pool.submit


def dewu_company(pages):
    """Fetch one list page of complaints and parse every complaint on it.

    pages: 0-based page index forwarded as the API's ``page`` parameter.
    """
    print("第" + str(pages) + "页")
    t = str(int(time.time() * 1000))  # cache-busting timestamp in ms
    url = "https://tousu.sina.com.cn/api/company/received_complaints"
    headers = {
        "authority": "tousu.sina.com.cn",
        "method": "GET",
        "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
        "scheme": "https",
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }
    params = {
        "couid": "5650743478",  # 1878960481 阿里 / 5650743478 京东 / 7046706808 得物
        "type": "1",
        "page_size": "10",
        "page": pages,
        "_": t,
    }
    res = session.get(url, params=params, headers=headers, proxies=proxies,
                      verify=False, timeout=5)
    info_list = res.json()["result"]["data"]["complaints"]
    # BUG FIX: the original did ``return parse_detail(info_url, info)``
    # inside this loop, so only the FIRST complaint of each page was ever
    # parsed and the remaining ~9 were silently skipped.
    for info in info_list:
        info_url = 'https:' + info['main']['url']
        parse_detail(info_url, info)


def parse_detail(info_url, info):
    """Fetch one complaint detail page, parse its fields, persist one JSON line.

    Side effects: appends the parsed dict to the shared ``big_list``,
    stores it under ``info['投诉详情']``, and appends one JSON line to
    京东投诉信息.json. Errors are printed and swallowed so a single bad
    page cannot kill a worker thread.
    """
    try:
        res = session.get(info_url, proxies=proxies, verify=False)
        new_dict = dict()
        new_dict['投诉编号'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[1]/text()')[0]
        new_dict['投诉对象'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[2]/a//text()')[0]
        new_dict['投诉问题'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[3]/text()')[0]
        new_dict['投诉要求'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[4]/text()')[0]
        new_dict['涉诉金额'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[5]/text()')[0]
        new_dict['投诉进度'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[6]/b/text()')[0]
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text

        # Complaint images (protocol-relative hrefs → absolute URLs).
        img_info_list = []
        for img_href in res.html.xpath('//*[@class="example-image-link"]/@href'):
            img_info_list.append("https:" + img_href)
        new_dict['投诉图片'] = img_info_list

        # Complaint videos: resolve each video id via the ivideo play API.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        new_vide_list = []
        for vide_id in vide_id_list:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            vide_res = session.get(vide_info_url, verify=False)
            new_vide_list.append(json.loads(vide_res.text))
        if new_vide_list:
            new_dict['投诉视频详情'] = new_vide_list

        info['投诉详情'] = new_dict
        big_list.append(new_dict)
        print("big_list==", big_list)
        # BUG FIX: the original dumped the whole accumulated ``big_list``
        # on every call, duplicating all earlier records in the file (the
        # author's TODO comment complained about exactly this). Write one
        # JSON record per line instead (JSON-lines format).
        with open('京东投诉信息.json', "a+", encoding='utf-8') as fw:
            fw.write(json.dumps(new_dict, ensure_ascii=False) + '\n')
    except Exception as e:
        # Best-effort scraping: log and continue with the next complaint.
        print(e)


def main(pages):
    """Submit ``pages`` list-page jobs to the pool and wait for them all."""
    startTime = time.time()
    for page in range(pages):
        name = pool.submit(dewu_company, page)
        pool_list.append(name)
    for n in pool_list:
        n.result()  # propagate worker exceptions / wait for completion
    print("全部结束并保存本地")
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))


if __name__ == '__main__':
    # 爬取页数 (number of list pages to scrape)
    main(20)

20页数据爬取时间:Done, Time cost: 1.6854908466339111

二、多线程爬取阿里详情投诉信息

#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""Multi-threaded scraper for Alibaba complaints on tousu.sina.com.cn.

dewu_company walks list pages 1-19 sequentially; each complaint's detail
page is fetched in its own thread (parse_detail), joined in batches so at
most ~10 detail threads run at once. Each parsed complaint is appended as
one JSON line to 阿里投诉信息.json.
"""
import json
import threading
import time
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
import warnings

warnings.filterwarnings("ignore")

session = HTMLSession()
proxies = None


def dewu_company():
    """Scrape list pages 1-19, spawning one detail thread per complaint."""
    for page in range(1, 20):
        print("第" + str(page) + "页")
        t = str(int(time.time() * 1000))  # cache-busting timestamp in ms
        url = "https://tousu.sina.com.cn/api/company/received_complaints"
        headers = {
            "authority": "tousu.sina.com.cn",
            "method": "GET",
            "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
            "scheme": "https",
            "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        params = {
            "couid": "1878960481",  # 1878960481 阿里 / 5650743478 京东 / 7046706808 得物
            "type": "1",
            "page_size": "10",
            "page": page,
            "_": t,
        }
        res = session.get(url, params=params, headers=headers, proxies=proxies,
                          verify=False, timeout=5)
        info_list = res.json()["result"]["data"]["complaints"]

        # One detail thread per complaint, joined in batches of ~10 so we
        # never pile up unbounded threads.
        ths = []
        for info in info_list:
            info_url = 'https:' + info['main']['url']
            th = threading.Thread(target=parse_detail, args=(info_url, info))
            th.start()
            ths.append(th)
            if len(ths) > 10:
                for th_one in ths:
                    th_one.join()
                ths = []
        for th_one in ths:
            th_one.join()


def parse_detail(info_url, info):
    """Fetch one complaint detail page, parse its fields, persist one JSON line.

    Runs in a worker thread. On success it stores the parsed dict under
    ``info['投诉详情']`` and appends one JSON line to 阿里投诉信息.json;
    on failure it prints the error and writes nothing.
    """
    try:
        res = session.get(info_url, proxies=proxies, verify=False)
        new_dict = dict()
        new_dict['投诉编号'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[1]/text()')[0]
        new_dict['投诉对象'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[2]/a//text()')[0]
        new_dict['投诉问题'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[3]/text()')[0]
        new_dict['投诉要求'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[4]/text()')[0]
        new_dict['涉诉金额'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[5]/text()')[0]
        new_dict['投诉进度'] = res.html.xpath('//div[@class="ts-d-question"]/ul/li[6]/b/text()')[0]
        new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text

        # Complaint images (protocol-relative hrefs → absolute URLs).
        img_info_list = []
        for img_href in res.html.xpath('//*[@class="example-image-link"]/@href'):
            img_info_list.append("https:" + img_href)
        new_dict['投诉图片'] = img_info_list

        # Complaint videos: resolve each video id via the ivideo play API.
        vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
        new_vide_list = []
        for vide_id in vide_id_list:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            vide_res = session.get(vide_info_url, verify=False)
            new_vide_list.append(json.loads(vide_res.text))
        if new_vide_list:
            new_dict['投诉视频详情'] = new_vide_list

        # BUG FIX: in the original the record was only stored inside the
        # video branch, so complaints WITHOUT videos could be dropped, and
        # the file write ran even after an exception, emitting empty "[]"
        # lines. Save every successfully parsed record, one JSON object
        # per line, and only on success.
        info['投诉详情'] = new_dict
        print("record==", new_dict)
        with open('阿里投诉信息.json', "a+", encoding='utf-8') as fw:
            fw.write(json.dumps(new_dict, ensure_ascii=False) + '\n')
    except Exception as e:
        # Best-effort scraping: log and let the thread finish quietly.
        print(e)


if __name__ == '__main__':
    startTime = time.time()
    dewu_company()
    endTime = time.time()
    print('Done, Time cost: %s ' % (endTime - startTime))

20页数据爬取时间:Done, Time cost: 20.348562240600586

python3多线程爬取京东投诉信息相关推荐

  1. go爬虫和python爬虫哪个好_python 爬虫实战项目--爬取京东商品信息(价格、优惠、排名、好评率等)-Go语言中文社区...

    利用splash爬取京东商品信息 一.环境 window7 python3.5 pycharm scrapy scrapy-splash MySQL 二.简介 为了体验scrapy-splash 的动 ...

  2. python爬取京东商品信息代码_爬取京东商品信息

    利用 BeautifulSoup + Requests 爬取京东商品信息并保存在Excel中 一.查看网页信息 打开京东商城,随便输入个商品,就选固态硬盘吧 先看看 URL 的规律,可以看到我们输入的 ...

  3. layui获取input信息_python爬虫—用selenium爬取京东商品信息

    python爬虫--用selenium爬取京东商品信息 1.先附上效果图(我偷懒只爬了4页) 2.京东的网址https://www.jd.com/ 3.我这里是不加载图片,加快爬取速度,也可以用Hea ...

  4. python爬取京东商品属性_python爬虫小项目:爬取京东商品信息

    #爬取京东手机信息 import requests from bs4 import BeautifulSoup from selenium import webdriver import re imp ...

  5. Day06,selenium的剩余用法、万能登录破解和爬取京东商品信息,及破解极验滑动验证码...

    一.自动登录抽屉新热榜 from selenium import webdriver import timedriver = webdriver.Chrome(r'D:\BaiduNetdiskDow ...

  6. python爬虫爬取京东商品评价_python爬取京东商品信息及评论

    ''' 爬取京东商品信息: 功能: 通过chromeDrive进行模拟访问需要爬取的京东商品详情页(https://item.jd.com/100003196609.html)并且程序支持多个页面爬取 ...

  7. Python爬取京东书籍信息(包括书籍评论数、简介等)

    Python爬取京东书籍信息(包括书籍评论数.简介等) 一. 工具 二. 准备爬取的数据 三. 内容简述 四. 页面分析 (一) 寻找目录中商品所在标签 (二) 寻找页面中能爬取到的所有数据 (三) ...

  8. python爬虫爬商品库存_python爬虫实践——爬取京东商品信息

    1 ''' 2 爬取京东商品信息:3 请求url:4 https://www.jd.com/5 提取商品信息:6 1.商品详情页7 2.商品名称8 3.商品价格9 4.评价人数10 5.商品商家11 ...

  9. Selenium介绍--实例爬取京东商品信息与图片

    目录 一.Selenium简介 二.Selenium组成 三.Selenium特点 四.Selenium的基本使用 1.下载所用浏览器需要的驱动 2.创建项目并导入依赖 3.入门 3.代码演示 五.实 ...

  10. 爬虫实战:使用Selenium爬取京东宝贝信息

    有些页面数据是采用Ajax获取的,但是这些Ajax接口参数比较复杂,可能会加入加密秘钥.对于这种页面,最方便的方法是通过selenium.可以用Selenium来模拟浏览器操作,抓取京东商品信息. 网 ...

最新文章

  1. 不属于JAVA类中的变量_在Java中,不属于整数类型变量的是( )。_学小易找答案...
  2. linux线程堆分配,如何在Linux中的相同进程下为线程分配堆栈或内存
  3. linux mysql 端口配置文件_Linux配置测试环境,部署项目(指定端口,数据库连接)...
  4. Nemo for transfer learning
  5. SVN 分支/合并/切换
  6. NodeJS+Express+MongoDB - 张果 - 博客园
  7. P3308-[SDOI2014]LIS【最小割】
  8. Apache手工编译安装(内附软件包)
  9. security安全表达式介绍
  10. windows下用C/C++访问MySQL数据库
  11. 大数据之-Hadoop完全分布式_SCP案例_同时在1000台服务器上安装JDK_配置环境变量---大数据之hadoop工作笔记0031
  12. [摘抄]游戏主循环逻辑
  13. html + CSS
  14. 最新达内大数据视频教程
  15. 计算机主板型号尺寸,10大华硕主板型号简介,组装电脑的朋友可别错过
  16. 数字鉴相器matlab,一种数字鉴相器的设计.pdf
  17. 【Tableau Desktop 企业日常问题20】Tableau怎么折线变虚线?
  18. 小觅摄像头 VINS-MONO安装
  19. 360极速浏览器屏蔽百度广告
  20. 【零基础深度学习教程第一课:深度学习基础】

热门文章

  1. HDU 5857 Median(水~)
  2. Failure obtaining db row lock: No row exists in table QRTZ_LOCKS for lock named
  3. 百融大数据自助查询_【百融云创科技股份有限公司现在大数据公司被查,百融受影响了吗】-看准网...
  4. 重庆大学计算机学院王臣,重庆大学UMD研究生网络文化工作室学生干部述职大会暨换届选举大会圆满落幕...
  5. Zigbee之旅(十):综合小实验——基于CC2430的温度监测系统(转)
  6. linux指定网卡走流量,Linux实时输出指定网卡流量
  7. ThinkPad Win7更改鼠标滚轮设置问题
  8. 移动开发技术的进化历程(原生开发与跨平台技术)
  9. 写在博士旅程之前|博士第一年|博士第三年|博士第四年
  10. java队列和栈 共同_java 栈和队列的模拟--java