Crawled content:

Crawl results:


Crawler code:

'''
Dianping (大众点评) - Yellow Crane Tower (黄鹤楼) review crawler:
① change the txt output path below;
② replace COOKIES with your own cookie (the same string is reused for the request header, so there is only one place to edit);
③ replace the shop id with the shop you want to crawl;
'''
import datetime
import random
import re
import time

import pymongo
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# MongoDB collection (kept as an optional sink; the script mainly writes to a txt file)
client = pymongo.MongoClient('localhost', 27017)
shidai = client['gongyuan']
comments = shidai['comments']

# Local chromedriver path (Selenium 3 style executable_path)
path_one = r'C:\Users\FREEDOM\AppData\Local\Google\Chrome\Application\chromedriver.exe'

COOKIES = 'fspop=test; cy=2290; cye=fengdu; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; _lxsdk_cuid=17362020194c8-0168451bb30b3f-7373e61-100200-1736202019ac8; _lxsdk=17362020194c8-0168451bb30b3f-7373e61-100200-1736202019ac8; _hc.v=03f94fd1-8e5a-b655-34ac-d35c976ad077.1595077168; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1595077167,1595205106; dper=fc42d25554a5cc6d072d66a47fa1bb1bce70b5db0708178d7745bdb59c867844bf8159e84c173786e9c039f84c74b4348dfff6405a63d466f0e745c92a88ceda5cbdb18c59f045a355c216ded25b37af99704ed99f5d17386fa65e24ad618889; ll=7fd06e815b796be3df069dec7836c3df; ua=%E5%A0%85%E5%BC%B7%E7%9A%84%E6%B3%A1%E6%B2%AB; ctu=54cab1a287e14fb7ab7bde1e708a3f9c58b9ab2f36c6812a7371265a19fa5245; uamo=15696741615; s_ViewType=10; dplet=47c9fa5fe4e8e62a58cebe0d34c549e9; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1595206647; _lxsdk_s=1736a82d671-1c-174-77b%7C%7C1'

# Output file for the scraped reviews
f = open('C:\\Users\\lhq\\Desktop\\store_shop.txt', 'w', encoding='utf-8')


class DianpingComment:
    font_size = 14  # glyph width in the svg font file
    start_y = 23    # baseline offset of the first glyph row
    ii = 0

    def __init__(self, shop_id, cookies, delay=7, handle_ban=True, comments=comments):
        self.shop_id = shop_id
        self._delay = delay
        self.num = 1
        self.db = comments
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
            'Cookie': COOKIES,  # same cookie string as the module-level COOKIES
        }
        self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all'.format(self.shop_id)
        self.sub_url = 'http://www.dianping.com'

    def run(self):
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()

    def _delay_func(self):
        """Sleep for a random delay of roughly self._delay ± 2 seconds."""
        delay_time = random.randint((self._delay - 2) * 10, (self._delay + 2) * 10) * 0.1
        time.sleep(delay_time)

    def _init_browser(self):
        """Initialise a headless Chrome browser and load the login cookies."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=path_one)
        browser.get(self._cur_request_url)
        for name, value in self._cookies.items():
            browser.add_cookie({'name': name, 'value': value})
        browser.refresh()
        return browser

    def _handle_ban(self):
        """Drag the slider captcha that appears when crawling too fast.

        Expects self._browser = self._init_browser() to have been set up beforehand.
        """
        try:
            self._browser.refresh()
            time.sleep(1)
            button = self._browser.find_element_by_id('yodaBox')
            move_x_offset = self._browser.find_element_by_id('yodaBoxWrapper').size['width']
            webdriver.ActionChains(self._browser).drag_and_drop_by_offset(
                button, move_x_offset, 0).perform()
        except Exception:
            pass

    def _format_cookies(self, cookies):
        """Turn a raw 'k1=v1; k2=v2; ...' cookie string into a dict."""
        cookies = {cookie.split('=')[0]: cookie.split('=')[1]
                   for cookie in cookies.replace(' ', '').split(';')}
        return cookies

    def _get_conment_page(self):
        """Request each review page and replace the <svgmtsi> glyph tags with real characters."""
        while self._cur_request_url:
            f.write("\n\n-----------------------------------------" + self._cur_request_url
                    + "-------------------------------------------\n\n")
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=self._cookies)
            # Retry on non-200 responses; add more cookie strings to the list to rotate accounts.
            while res.status_code != 200:
                cookies = self._format_cookies(random.choice([COOKIES]))
                res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=cookies)
                if res.status_code == 200:
                    break
            html = res.text
            class_set = []
            for span in re.findall(r'<svgmtsi class="([a-zA-Z0-9]{5,6})"></svgmtsi>', html):
                class_set.append(span)
            for class_name in class_set:
                try:
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name,
                                  self._font_dict[class_name], html)
                except Exception:
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, '', html)
                    print('failed to replace class {}'.format(class_name))
            doc = pq(html)
            self._parse_comment_page(html)
            # Follow the "next page" link until there is none: this drives the pagination loop.
            if doc('.NextPage').attr('href'):
                self._default_headers['Referer'] = self._cur_request_url
                next_page_url1 = doc('.NextPage').attr('href')
                next_page_url = self.sub_url + str(next_page_url1)
                print('next_url:{}'.format(next_page_url))
            else:
                next_page_url = None
            print('next_page_url:{}'.format(next_page_url))
            self._cur_request_url = next_page_url

    def _data_pipeline(self, data):
        """Process one review record (overridden in subclasses)."""
        print(data)

    def _parse_comment_page(self, html):
        """Parse a review page, extract the fields and write them to the txt file."""
        doc = pq(html)
        for li in doc('div.review-list-main > div.reviews-wrapper > div.reviews-items > ul > li'):
            doc_text = pq(li)
            if doc_text('.dper-info .name').text():
                name = doc_text('.dper-info .name').text()
            else:
                name = None
            try:
                star = doc_text('.review-rank .sml-rank-stars').attr('class')
            except IndexError:
                star = None
            if doc_text('div.misc-info.clearfix > .time').text():
                date_time = doc_text('div.misc-info.clearfix > .time').text()
            else:
                date_time = None
            if doc_text('.main-review .review-words').text():
                comment = doc_text('.main-review .review-words').text()
            else:
                comment = None
            data = {
                'name': name,
                'date_time': date_time,
                'star': star,
                'comment': comment,
            }
            # Write the record to the txt file, one dict per line
            f.write(str(data))
            f.write("\n")

    def _get_css_link(self, url):
        """Request the first review page and extract the svgtextcss stylesheet link."""
        try:
            print(url)
            res = requests.get(url, headers=self._default_headers, cookies=self._cookies)
            html = res.text
            css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
            print(css_link)
            assert css_link
            css_link = 'http:' + css_link[1]
            return css_link
        except Exception:
            return None

    def _get_font_dict(self, url):
        """Build the css-class -> character mapping from the stylesheet and its svg files."""
        res = requests.get(url, headers=self._css_headers)
        html = res.text
        background_image_link = re.findall(r'background-image:.*?\((.*?svg)\)', html)
        print(background_image_link)
        background_image_link_list = []
        for i in background_image_link:
            url = 'http:' + i
            background_image_link_list.append(url)
        print(background_image_link_list)
        html = re.sub(r'span.*?\}', '', html)
        group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)
        # Merge the offset dictionaries of all svg files into one
        font_dict_by_offset = {}
        for i in background_image_link_list:
            font_dict_by_offset.update(self._get_font_dict_by_offset(i))
        print(font_dict_by_offset)
        font_dict = {}
        for class_name, x_offset, y_offset in group_offset_list:
            x_offset = x_offset.replace('.0', '')
            y_offset = y_offset.replace('.0', '')
            try:
                font_dict[class_name] = font_dict_by_offset[int(y_offset)][int(x_offset)]
            except Exception:
                font_dict[class_name] = ''
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """Build the (y offset -> x offset -> character) dict from one svg file.

        There are at least two svg layouts (only two have been seen so far).
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text
        font_dict = {}
        y_list = re.findall(r'd="M0 (\d+?) ', html)
        if y_list:
            # Layout 1: rows defined by <path d="M0 y ..."> plus <textPath> text
            font_list = re.findall(r'<textPath .*?>(.*?)<', html)
            for i, string in enumerate(font_list):
                y_offset = self.start_y - int(y_list[i])
                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        else:
            # Layout 2: rows defined by <text y="..."> elements
            font_list = re.findall(r'<text.*?y="(.*?)">(.*?)<', html)
            for y, string in font_list:
                y_offset = self.start_y - int(y)
                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        return font_dict


class Customer(DianpingComment):
    def _data_pipeline(self, data):
        print(data)


if __name__ == "__main__":
    dianping = Customer('H2vLcBTsDA6wSxY4', cookies=COOKIES)
    dianping.run()
    f.close()
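The trickiest part of the scraper is Dianping's font obfuscation: review characters are rendered as empty <svgmtsi class="xxxxx"> tags, the CSS class maps to a background offset, and that offset points at a glyph inside an SVG sprite. `_get_font_dict_by_offset` and `_get_font_dict` invert that mapping. Below is a minimal offline sketch of the same offset arithmetic (`start_y - y` selects the row, `-j * font_size` selects the column), run against an invented SVG snippet and CSS rule so the lookup can be sanity-checked without hitting dianping.com; the sample markup, class name and characters are made up for illustration.

import re

FONT_SIZE = 14   # same glyph width the crawler assumes
START_Y = 23     # same baseline constant the crawler assumes

# Hypothetical svg snippet in "layout 2" (<text y="..."> rows); real files contain many rows.
SVG_SAMPLE = '<text x="0" y="38">黄鹤楼很美</text>'
# Hypothetical css rule: class "abcde" points at background offset (-28px, -15px).
CSS_SAMPLE = '.abcde{background:-28.0px -15.0px;}'

def build_font_dict(svg_text):
    """Map y-offset -> x-offset -> character, mirroring _get_font_dict_by_offset."""
    font_dict = {}
    for y, string in re.findall(r'<text.*?y="(.*?)">(.*?)<', svg_text):
        row = {-j * FONT_SIZE: ch for j, ch in enumerate(string)}
        font_dict[START_Y - int(y)] = row
    return font_dict

def lookup(css_text, font_dict):
    """Resolve each css class to its character via its (x, y) background offset."""
    result = {}
    for class_name, x, y in re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', css_text):
        x, y = x.replace('.0', ''), y.replace('.0', '')
        result[class_name] = font_dict.get(int(y), {}).get(int(x), '')
    return result

font_dict = build_font_dict(SVG_SAMPLE)
print(lookup(CSS_SAMPLE, font_dict))   # expected output: {'abcde': '楼'}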

Removing some of the fields from the TXT file:

import ast

file = open("C:\\Users\\lhq\\Desktop\\九峰森林动物园-大众点评.txt", 'r', encoding='utf-8-sig')
file_target = open("C:\\Users\\lhq\\Desktop\\九峰森林动物园.txt", 'w', encoding='utf-8-sig')

for item in file.readlines():
    # Skip blank lines and the "------ url ------" separator lines
    if item.strip() != '' and item[0] != '-':
        record = ast.literal_eval(item)            # parse the dict literal (safer than eval)
        del record['name']                         # drop the 'name' field
        time_str = record['date_time']             # date_time is a string
        record['date_time'] = time_str.split()[0]  # keep only the leading 年-月-日 part
        file_target.write(str(record))             # write the cleaned record to the target file
        file_target.write('\n')

file_target.close()
file.close()
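For reference, each line the crawler writes is just str() of a dict, so the cleanup boils down to the transformation below. This is a hedged, standalone illustration: the raw line and its values are invented, and the exact date_time text depends on what Dianping renders.

import ast

# Hypothetical raw line as the crawler would write it (values invented for illustration)
raw = "{'name': '某用户', 'date_time': '2020-07-20 更新于2020-07-21', 'star': 'sml-rank-stars sml-str40', 'comment': '风景不错'}"

record = ast.literal_eval(raw)                            # parse the dict literal safely
del record['name']                                        # drop the username
record['date_time'] = record['date_time'].split()[0]      # keep only the date part
print(record)
# {'date_time': '2020-07-20', 'star': 'sml-rank-stars sml-str40', 'comment': '风景不错'}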
