python批量下载公众号历史文章（一）

【新地址】

Rabbit & Bear【半自动版】python批量下载公众号历史文章（一） | Rabbit & Bear (unclevicky.github.io)

【背景】

原博客写于2019年9月，后来不知何时被平台莫名删除；重新发布时，相同的内容始终无法通过审核，因此重写了一遍，详细内容见上面的【新地址】。

【运行效果】

【完整代码】

# _*_ coding:utf-8 _*_
import os,sys
import requests
import json
import subprocess
import re
import random
import time
from bs4 import BeautifulSoup
from datetime import datetime,timedelta
from time import sleep

# Headers sent with every page and image request (a desktop WeChat
# user agent string). Shared so both request helpers stay in sync.
_REQUEST_HEADERS = {
    "Accept": "*/*",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 "
        "QBCore/4.0.1219.400 QQBrowser/9.0.2524.400 "
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 "
        "MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
    'Connection': 'keep-alive',
}

# Seconds to wait for a server before giving up; without a timeout a
# stalled connection would block the whole batch forever.
_REQUEST_TIMEOUT = 30


class ArticleInfo():
    """Metadata of one article found in the captured message list.

    Attributes:
        url: address of the article page.
        title: article title.
        idx_num: 1-based position of the article within its push message;
            used to build unique file names.
        atc_datetime: publish time as a "YYYYMMDDHHMMSS" string.
    """

    def __init__(self, url, title, idx_num, atc_datetime):
        self.url = url
        self.title = title
        self.idx_num = idx_num
        self.atc_datetime = atc_datetime


def read_file(file_path):
    """Return the whole content of a UTF-8 text file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def save_file(file_path, file_content):
    """Write *file_content* to *file_path* as UTF-8, replacing any old file."""
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(file_content)


def get_html(url):
    """Download a page and return its text, or None if the status is not 200."""
    response = requests.get(url, headers=_REQUEST_HEADERS, proxies=None,
                            timeout=_REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    return None


def get_save_image(url, img_file_path):
    """Download *url* and write the raw response body to *img_file_path*."""
    response = requests.get(url, headers=_REQUEST_HEADERS, proxies=None,
                            timeout=_REQUEST_TIMEOUT)
    with open(img_file_path, "wb") as f:
        f.write(response.content)


def get_article_list(json_path):
    """Collect the articles described by the captured JSON files.

    Args:
        json_path: folder holding the JSON responses exported from Fiddler.

    Returns:
        A list of ArticleInfo, one per article (including every item of a
        multi-article push).
    """
    article_list = []
    for file in os.listdir(json_path):
        file_path = os.path.join(json_path, file)
        json_cont = json.loads(read_file(file_path))
        # 'general_msg_list' is itself a JSON document stored as a string.
        json_list = json.loads(json_cont['general_msg_list'])
        for lst in json_list['list']:
            # Several articles can share one publish time; the index keeps
            # their file names distinct.
            atc_idx = 0
            atc_datetime = seconds_to_time(lst['comm_msg_info']['datetime'])
            if lst['comm_msg_info']['type'] != 49:  # 49 = ordinary article push
                continue
            ext_info = lst['app_msg_ext_info']
            atc_idx += 1
            article_list.append(
                ArticleInfo(ext_info['content_url'], ext_info['title'],
                            atc_idx, atc_datetime))
            if ext_info.get('is_multi') == 1:  # several articles in one push
                for multi in ext_info.get('multi_app_msg_item_list', []):
                    atc_idx += 1
                    article_list.append(
                        ArticleInfo(multi['content_url'], multi['title'],
                                    atc_idx, atc_datetime))
    return article_list


def chg_img_link(bs_html):
    """Turn protocol-relative <link href="//..."> values into http:// URLs.

    The parsed document is modified in place; <link> tags without an href
    are left untouched.
    """
    for link in bs_html.find_all("link"):
        href = link.attrs.get("href", "")
        if href.startswith("//"):
            link.attrs["href"] = "http:" + href


def rep_image(org_html, local_img_path, html_name):
    """Download every image of a page and point the <img> tags at the copies.

    Args:
        org_html: HTML text of the downloaded article.
        local_img_path: folder the images are saved into.
        html_name: file name of the article; used as prefix of image names.

    Returns:
        The rewritten HTML as a string.
    """
    bs_html = BeautifulSoup(org_html, "lxml")
    for img_idx, img in enumerate(bs_html.find_all("img"), start=1):
        # The real address is in data-src when present, otherwise in src.
        org_url = img.attrs.get("data-src", img.attrs.get("src", ""))
        if org_url.startswith("//"):
            org_url = "http:" + org_url
        if not org_url:
            img.attrs["src"] = ""
            continue
        print("download image ", img_idx)
        img_type = img.attrs.get("data-type", "png")
        img_name = html_name + "_" + str(img_idx) + "." + img_type
        img_file_path = os.path.join(local_img_path, img_name)
        try:
            get_save_image(org_url, img_file_path)
        except requests.RequestException as err:
            # One broken image must not abort the whole article; keep the
            # tag as it was so the remote address is still referenced.
            print("download image failed:", org_url, err)
            continue
        img.attrs["src"] = "images/" + img_name
    chg_img_link(bs_html)
    return str(bs_html)


def down_html(json_path, html_path):
    """Download all articles listed under *json_path* into *html_path*.

    Images go to an "images" sub-folder. Articles whose HTML file already
    exists are skipped, so an interrupted run can simply be restarted.
    """
    os.makedirs(html_path, exist_ok=True)
    local_img_path = os.path.join(html_path, "images")
    os.makedirs(local_img_path, exist_ok=True)

    article_list = get_article_list(json_path)
    article_list.sort(key=lambda x: x.atc_datetime, reverse=True)  # newest first
    tot_article = len(article_list)
    for i, atc in enumerate(article_list, start=1):
        # Publish time + index uniquely identifies an article.
        atc_unique_name = str(atc.atc_datetime) + "_" + str(atc.idx_num)
        html_name = atc_unique_name + ".html"
        html_file_path = os.path.join(html_path, html_name)
        print(i, "of", tot_article, atc_unique_name, atc.title)
        if os.path.exists(html_file_path):  # resume support
            print("{} existed already!".format(html_file_path))
            continue
        org_atc_html = get_html(atc.url)
        if org_atc_html is None:
            # get_html returns None for a non-200 response; parsing None
            # would raise, so skip and let a later run retry.
            print("{} could not be downloaded, skipped".format(atc.url))
            continue
        new_atc_html = rep_image(org_atc_html, local_img_path, html_name)
        save_file(html_file_path, new_atc_html)
        sleep(round(random.uniform(1, 3), 2))  # random pause between requests


def _safe_filename(name):
    """Replace characters that are not allowed in file names with '_'."""
    return re.sub(r'[\\/:*?"<>|]', "_", name)


def conv_html_pdf(html_path, pdf_path):
    """Convert every downloaded HTML file in *html_path* to a PDF in *pdf_path*.

    The PDF is named after the HTML file plus the article title (taken from
    the element with id "activity-name") when one is found.
    """
    os.makedirs(pdf_path, exist_ok=True)
    for f in os.listdir(html_path):
        # Only real article pages: skip non-HTML files and temp files.
        if not f.endswith(".html") or "tmp" in f:
            continue
        html_file_path = os.path.join(html_path, f)
        # Stripped-down temporary copy that is handed to wkhtmltopdf.
        html_tmp_file = html_file_path[:-5] + "_tmp.html"
        bs_html = BeautifulSoup(read_file(html_file_path), "lxml")

        pdf_title = ""
        title_tag = bs_html.find(id="activity-name")
        if title_tag is not None:
            # Drop all whitespace, then anything a file system would reject.
            title_text = "".join(title_tag.get_text().split())
            pdf_title = "_" + _safe_filename(title_text)
            print(pdf_title)
        # f is already the bare file name, so no separator search is needed
        # (searching for "/" fails on Windows paths).
        pdf_name = f[:-5] + pdf_title
        pdf_file_path = os.path.join(pdf_path, pdf_name + ".pdf")

        # Remove elements that are not needed in the PDF to speed up
        # the conversion.
        for tag in bs_html(["script", "iframe", "link"]):
            tag.extract()
        save_file(html_tmp_file, str(bs_html))
        call_wkhtmltopdf(html_tmp_file, pdf_file_path)


def call_wkhtmltopdf(html_file_path, pdf_file_path, skipExists=True, removehtml=True):
    """Run wkhtmltopdf to convert one HTML file into a PDF.

    Args:
        html_file_path: HTML file to convert.
        pdf_file_path: destination PDF file.
        skipExists: when True, do nothing if the PDF already exists.
        removehtml: when True, delete *html_file_path* afterwards.

    Raises:
        subprocess.CalledProcessError: wkhtmltopdf exited with a non-zero
            status.
    """
    if skipExists and os.path.exists(pdf_file_path):
        print("pdf_file_path already existed!")
        if removehtml:
            os.remove(html_file_path)
        return
    exe_path = cfg['wkhtmltopdf']  # location of the wkhtmltopdf executable
    # An argument list (not one joined string) works on every platform with
    # shell=False and keeps paths containing spaces intact.
    cmd_list = [exe_path, "--load-error-handling", "ignore",
                html_file_path, pdf_file_path]
    print(" ".join(cmd_list))
    try:
        subprocess.check_call(cmd_list, shell=False)
    finally:
        # Clean up the temporary HTML even when the conversion fails.
        if removehtml:
            os.remove(html_file_path)


def get_config():
    """Load config/wechat.cfg (JSON) and return it as a dict.

    Every backslash is turned into "/" before parsing so that Windows-style
    paths in the file do not form invalid JSON escape sequences.
    """
    cfg_file = read_file("config/wechat.cfg")
    cfg_file = cfg_file.replace("\\\\", "/").replace("\\", "/")
    return json.loads(cfg_file)


def seconds_to_time(seconds):
    """Format seconds since the epoch as local time "YYYYMMDDHHMMSS"."""
    return time.strftime("%Y%m%d%H%M%S", time.localtime(seconds))


cfg = get_config()  # global configuration, loaded once at start-up
# get_article_list("./tmp/")  # for test
# down_html("./tmp/", "./html/")  # for test

if __name__ == "__main__":
    # Usage: python <script> [html|pdf]
    #   html (or no argument): download the articles as HTML files
    #   pdf: convert the downloaded HTML files to PDF
    arg = sys.argv[1] if len(sys.argv) > 1 else None
    if arg is None or arg == "html":
        down_html(cfg['jsonDir'], cfg['htmlDir'])
    elif arg == "pdf":
        conv_html_pdf(cfg['htmlDir'], cfg['pdfDir'])

python批量下载公众号历史文章（一）相关推荐

python爬取公众号历史文章_pythons爬虫：抓取微信公众号历史文章(selenium+phantomjs)...
原标题:pythons爬虫:抓取微信公众号历史文章(selenium+phantomjs) 大数据挖掘DT数据分析公众号: datadw 本文爬虫代码可以通过回复本公众号关键字"公众号& ...
python爱好者社区公众号历史文章合集_如何优雅的抓取微信公众号历史文章
这是几天前在公众号上发的文章,主要讨论现在微信公众号文章抓取的一般思路以及优缺点,我不会讲技术细节,但我会分享别人已经开源的项目,你可以参考代码开了解其中的细节. 背景微信公众号历史记录只可以通过客 ...
python爱好者社区公众号历史文章合集_GitHub - acherie/weixin_crawler: 高效微信公众号历史文章和阅读数据爬虫powered by scrapy...
What is weixin_crawler? weixin_crawler是一款使用Scrapy.Flask.Echarts.Elasticsearch等实现的微信公众号文章爬虫,自带分析报告和全文 ...
python 下载公众号文章_python3下载公众号历史文章
最近担心一直关注的公众号的文章突然失踪,心心念念的文章一定要拽住自己手中才好.于是找了各种办法,其实网上有代理采集的,想想为了长久之计还是自己研究一个办法出来. 于是百度谷歌必应各种搜索引擎上手,第一 ...
python爬取公众号历史文章_python3 scrapy爬取微信公众号及历史信息V1.0
妹子图.png 环境:python3 scrapy 目的写这篇文章主要是做一下纪念,毕竟是搞了快两天的东西了,今天加大了量,使用scrapy爬取100多个微信公众号,然后出现IP被封的情况下,当然 ...
python爬取公众号历史文章
文章来源学习网上以及自己修改. 参考链接: https://blog.csdn.net/d1240673769/article/details/75907152 目的 ''' 爬取公众号的历史文章信 ...
python爱好者社区公众号历史文章合集_精心整理 |人工智能爱好者社区历史文章合集（作者篇）...
公众号创办两年来,感谢小伙伴们长期的支持和关注,也感谢各位投稿老师带给我们的好作品,支撑起了我们这个社区,让更多人工智能爱好者和从业者获得更棒的知识给养.六月已经过去了,在这年中之际,小编选取在人工智 ...
python爬取公众号历史文章_Python爬虫爬取微信公众号历史文章全部链接
因为朋友问我能不能找一下一个微信公众号的全部历史文章的链接,我就帮他弄了一下,通过百度和谷歌发现现在大家爬微信公众号的思路基本都是下面两种: 通过搜狗搜索微信公众号然后拿到链接通过fiddler检测 ...
python爬取公众号历史文章_python爬微信公众号前10篇历史文章（6）-话说http cookies...
早期Web开发面临的最大问题之一是如何管理状态.简言之,服务器端没有办法知道两个请求是否来自于同一个浏览器.这是cookies的起源. 什么是cookie? A cookie is a small s ...

python批量下载公众号历史文章（一）

python批量下载公众号历史文章（一）相关推荐

最新文章

热门文章