
import re
import datetime
import time

import redis
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient
from pymysql import connect
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities


class ArticleFilter(object):
    def __init__(self, title, content):
        # decode_responses=True so Redis returns str instead of bytes (split(',') needs str)
        self.redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=9, decode_responses=True)
        self.first_keywords = str(self.redis_client.get('first_keywords')).split(',')
        self.second_keywords = str(self.redis_client.get('second_keywords')).split(',')
        self.title = title
        self.content = content
        self.group_id_list = list()

    # Frequency of first-level keywords in the article content
    def article_content_filter(self):
        first_keyword_dict = dict()
        second_keyword_dict = dict()
        # Search the content
        if isinstance(self.content, list):
            text = ''.join([item.get('text') for item in self.content if item.get('text')])
            # Find the most frequent first-level keyword in the article content
            for first_keyword in self.first_keywords:
                num = text.count(first_keyword)
                if num > 0:
                    first_keyword_dict[first_keyword] = num
            first_res = self.select_high(first_keyword_dict)
            if len(first_res) == 1:
                keyword, num = first_res[0][0], first_res[0][1]
                keyword = {'first_keywords': keyword}
            else:
                # No single top first-level keyword (none, or a tie): fall back to second-level attribute words
                for second_keyword in self.second_keywords:
                    num = text.count(second_keyword)
                    if num > 0:
                        second_keyword_dict[second_keyword] = num
                second_res = self.select_high(second_keyword_dict)
                if len(second_res) == 1:
                    keyword, num = second_res[0][0], second_res[0][1]
                    keyword = {'second_keywords': keyword}
                elif len(second_res) > 1:
                    # Several second-level words tied for first place:
                    # publish the article into every matching second-level category
                    keyword = [x[0] for x in second_res]
                    keyword = {'second_keywords': keyword}
                else:
                    # No second-level match, but several first-level keywords tied:
                    # publish the article into every matching first-level category
                    if len(first_res) > 1:
                        keyword = [x[0] for x in first_res]
                        keyword = {'first_keywords': keyword}
                    else:
                        return False
            return keyword
        return False

    # Search the title
    def article_title_filter(self):
        first_keyword_dict = dict()
        for first_keyword in self.first_keywords:
            num = self.title.count(first_keyword)
            if num > 0:
                first_keyword_dict[first_keyword] = num
        first_res = self.select_high(first_keyword_dict)
        if len(first_res) == 1:
            keyword, num = first_res[0][0], first_res[0][1]
            first_keywords = {'first_keywords': keyword}
            return first_keywords
        return False

    # Keyword lookup -- main entry, returns the category IDs matching the article's keywords
    def article_filter(self):
        # 1. Search the title
        title_keyword = self.article_title_filter()
        if title_keyword:
            first_keywords = title_keyword.get('first_keywords')
            group_id = self.get_keyword_group_id(first_keywords)
            self.group_id_list.append(group_id)
        else:
            # 2. Search the content
            content_keyword = self.article_content_filter()
            if content_keyword:
                first_keywords = content_keyword.get('first_keywords')
                if isinstance(first_keywords, str):
                    group_id = self.get_keyword_group_id(first_keywords)
                    self.group_id_list.append(group_id)
                elif isinstance(first_keywords, list):
                    for first_keyword in first_keywords:
                        group_id = self.get_keyword_group_id(first_keyword)
                        self.group_id_list.append(group_id)
                else:
                    second_keywords = content_keyword.get('second_keywords')
                    if isinstance(second_keywords, str):
                        group_id = self.get_keyword_group_id(second_keywords)
                        self.group_id_list.append(group_id)
                    elif isinstance(second_keywords, list):
                        for second_keyword in second_keywords:
                            group_id = self.get_keyword_group_id(second_keyword)
                            self.group_id_list.append(group_id)
                    else:
                        self.group_id_list = None
            else:
                self.group_id_list = None
        return self.group_id_list

    # Pick the keyword(s) with the highest frequency
    @staticmethod
    def select_high(keyword_dict):
        ls = sorted(list(keyword_dict.items()), key=lambda a: a[1], reverse=True)
        index = 0
        # Keep every entry tied with the top count
        for i, x in enumerate(ls):
            if x[1] == ls[0][1]:
                index = i + 1
            else:
                break
        print(ls[:index])
        return ls[:index]

    # Fetch the article category ID for a keyword from Redis
    def get_keyword_group_id(self, keyword):
        article_group_id = self.redis_client.hget('group_id_of_keyword', keyword)
        return article_group_id

    # Sensitive-word filtering
    def sensitive_words_filter(self):
        try:
            sensitive_words = self.redis_client.get('sensitive_words')
            if sensitive_words:
                sensitive_words = sensitive_words.split(',')
                text = ''.join([item.get('text') for item in self.content if item.get('text')])
                for sensitive_word in sensitive_words:
                    resp_title = self.title.find(sensitive_word)
                    resp_content = text.find(sensitive_word)
                    if resp_title != -1 or resp_content != -1:
                        return True
                else:
                    return False
            else:
                return False
        except Exception as e:
            return False


class huxiu_spider(object):
    def __init__(self):
        self.base_url = 'https://www.huxiu.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
        }

    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        return text

    # Article list page
    def first_analysis(self, text):
        selector = etree.HTML(text)
        results = selector.xpath('//*[@id="index"]/div[2]/div[2]/div')
        # //*[@id="index"]/div[1]/div[2]/div[9]/div[1]/a/div/@style
        new_list = []
        i = 1
        for res in results:
            res_dict = {}
            web_name = '虎嗅网'
            res_dict['web_name'] = web_name
            # Article title
            title = res.xpath('div[1]/h2/a/text()')[0]
            print('正在爬取第%s篇文章,标题是:%s' % (i, title))
            num = self.get_title(title, web_name)
            print('查看文章是否存在=====')
            if num == 0:
                print('文章不存在~~~')
                url = res.xpath('div/h2/a[starts-with(@href, "/article")]/@href')[0]
                article_link = 'https://www.huxiu.com' + url
                article_content, article_time = self.second_analysis(article_link)
                if article_content != 1:
                    print('敏感词开始过滤')
                    # Local sensitive-word filtering
                    article_filter_obj = ArticleFilter(title, article_content)
                    resp = article_filter_obj.sensitive_words_filter()
                    if resp:
                        print('文章存在敏感词汇')
                    else:
                        # Article content
                        res_dict['content'] = article_content
                        # Publication time
                        res_dict['date'] = article_time
                        # Article URL
                        res_dict['article_link'] = article_link
                        # Article title
                        res_dict['title'] = title
                        # Article summary
                        summary = res.xpath('div/div[2]/text()')[0]
                        res_dict['summary'] = summary
                        # Author
                        name = res.xpath('div/div/a/span/text()')[0]
                        res_dict["name"] = name
                        # Author link
                        # res_dict["author_link"] = 'https://www.huxiu.com' + res.xpath('div/div/a/@href')[0]
                        # Cover image shown in the article list
                        if res.xpath('div/a/img/@data-original'):
                            min_pic = res.xpath('div/a/img/@data-original')[0]
                            oss_url = self.upload_oss(min_pic)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        elif res.xpath('a/div/img/@data-original'):
                            min_pic = res.xpath('a/div/img/@data-original')[0]
                            oss_url = self.upload_oss(min_pic)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        elif res.xpath('div/a/div/@style'):
                            # Video-style entry: cut the image URL out of the style attribute
                            mystr = res.xpath('div/a/div/@style')[0]
                            print(111, mystr)
                            start_index = mystr.find('(', 0, len(mystr))
                            end_index = mystr.find('?', 0, len(mystr))
                            min_pic = mystr[start_index + 2:end_index]
                            print(123, min_pic)
                            oss_url = self.upload_oss(min_pic)
                            print(321, oss_url)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        else:
                            oss_url = ''
                            res_dict["min_pic"] = oss_url
                        self.upload_mongo(res_dict)
                        self.upload_mysql(title, name, article_time, oss_url, summary, web_name, article_link)
                        print('成功获取并保存第%s篇文章' % i)
                        i += 1
                        new_list.append(res_dict)
                else:
                    i += 1
                    continue
            else:
                i += 1
                continue
        print('成功获取到%s篇文章' % (i - 1))

    # Article detail page
    def second_analysis(self, url):
        try:
            # Customize the PhantomJS request headers
            cap = DesiredCapabilities.PHANTOMJS.copy()
            for key, value in self.headers.items():
                cap['phantomjs.page.customHeaders.{}'.format(key)] = value
            # Pass the custom headers to the PhantomJS driver
            browser = webdriver.PhantomJS('/usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
                                          desired_capabilities=cap)
            browser.get(url)
            time.sleep(3)
            html = browser.page_source
            # Pick out the publication time
            selector = etree.HTML(html)
            if selector.xpath('//div[@class="column-link-box"]/span[1]/text()'):
                article_time = selector.xpath('//div[@class="column-link-box"]/span[1]/text()')[0]
                print(article_time)
            # //*[@id="article_content301428"]/p[138]/span[2]/text() ---new
            # //*[@class="article-author"]/span[2]/text() ---old
            elif selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text()'):
                article_time = selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text()')[0]
            else:
                article_time = ''
            # Head image inside the article
            if selector.xpath('//div[@class="article-img-box"]/img/@src'):
                article_min_pic = selector.xpath('//div[@class="article-img-box"]/img/@src')[0]
            else:
                article_min_pic = ""
            # Pick out the article body
            content = selector.xpath('//*[@class="article-content-wrap"]')[0]
            result = etree.tostring(content, method='html')
            print('获取到文章内容')
            # Build the bs4 object
            soup = BeautifulSoup(result, 'html.parser', from_encoding='utf-8')
            new_list = []
            # Collect content by tag
            ls = soup.find_all(["p", "img"])
            for table in ls:
                res = {}
                data = table.get_text()
                if data:
                    # Strip whitespace and special characters
                    new_data = "".join(data.split())
                    new_data = new_data.replace(u'\ufeff', '')
                    if new_data != "":
                        res["text"] = new_data
                        new_list.append(res)
                link = table.get('src')
                if link:
                    oss_url = self.upload_oss(link)
                    res["img"] = oss_url
                    new_list.append(res)
            if article_min_pic != '':
                article_min_pic = self.upload_oss(article_min_pic)
                # article_min_pic = article_min_pic.replace('http', 'https')
                new_list.insert(0, {'img': article_min_pic})
            browser.quit()
            return new_list, article_time
        except Exception as e:
            print('文章不存在了', e)
            return 1, 1

    # Upload an image to OSS
    def upload_oss(self, url):
        kw = {
            'fileurl': url,
            'filepath': 'gander_goose/dev/test2'
        }
        result = requests.post(url='http://api.max-digital.cn/Api/oss/uploadByUrl', data=kw)
        result = result.json()
        oss_url = result.get('oss_file_url')
        oss_url = oss_url.replace('maxpr.oss-cn-shanghai.aliyuncs.com', 'cdn.max-digital.cn')
        oss_url = oss_url.replace('http', 'https')
        return oss_url

    # Save the article to MongoDB
    def upload_mongo(self, article_dict):
        try:
            client = MongoClient('127.0.0.1', 27017)
            my_db = client.wechat
            my_db.articles.insert_one(article_dict)
            print('上传到mongo成功')
        except Exception as e:
            print('上传到mongo失败:', e)

    # Insert the article into MySQL
    def upload_mysql(self, title, name, date, oss_url, summary, web_name, link):
        try:
            # Create the connection
            conn = connect(host='localhost', port=3306, database='wechat',
                           user='root', password='mysql', charset='utf8')
            # Get a cursor
            cs1 = conn.cursor()
            # Insert one row
            now = datetime.datetime.now()
            imgurl = "https://cdn.max-digital.cn/gander_goose/dev/test2/15368082362561.jpg"
            sql1 = "insert into article_info (title,author,wechat_art_date,min_pic,summary,web_name,is_show,is_big,link,round_head_img,create_time) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                title, name, date, oss_url, summary, web_name, 0, 0, link, imgurl, now)
            cs1.execute(sql1)
            # ID of the article that was just inserted
            new_article_id = int(conn.insert_id())
            # Bump the custom sort value of articles in the "24 hours" category
            # sql2 = 'update article_group set sort_num = sort_num + 1 where group_id=1'
            # cs1.execute(sql2)
            # Publish into the "24 hours" category
            sql3 = 'insert into article_group (article_id,group_id,sort_num,create_time) values ("%s", "%s", "%s", "%s")' % (
                new_article_id, 1, 1, now)
            cs1.execute(sql3)
            # Mark the article as online
            sql4 = "update article_info set is_show = 1, zj_art_date='%s' where id='%s'" % (now, new_article_id)
            cs1.execute(sql4)
            conn.commit()
            cs1.close()
            conn.close()
            print('上传到mysql成功')
        except Exception as e:
            print('mysql上传失败:', e)

    # Check whether the article already exists in MySQL; returns the number of matching rows
    def get_title(self, title, query):
        conn = connect(host='127.0.0.1', port=3306, database='zj',
                       user='root', password='mysql', charset='utf8')
        # Get a cursor
        cs1 = conn.cursor()
        res = 'select * from article_info where title = "%s" and web_name = "%s" ' % (title, query)
        num = cs1.execute(res)
        return num

    def run(self):
        text = self.send_request(self.base_url)
        self.first_analysis(text)


if __name__ == '__main__':
    huxiu = huxiu_spider()
    while True:
        start_time = time.time()
        print('开始时间:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
        huxiu.run()
        time.sleep(3600)
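A note on the rendering step: PhantomJS is no longer maintained and Selenium 4 removed webdriver.PhantomJS entirely, so second_analysis will not start on a current setup. Below is a minimal sketch of the same "fetch rendered HTML" step using headless Chrome instead. It assumes Selenium 4+ and a chromedriver on PATH; fetch_rendered_html is a hypothetical helper, not part of the original spider.

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def fetch_rendered_html(url, user_agent):
    # Headless Chrome standing in for the PhantomJS call in second_analysis
    options = Options()
    options.add_argument('--headless')                     # no visible window
    options.add_argument('--user-agent=%s' % user_agent)   # same custom UA as self.headers
    browser = webdriver.Chrome(options=options)            # assumes chromedriver is on PATH
    try:
        browser.get(url)
        time.sleep(3)          # same crude wait as the original code
        return browser.page_source
    finally:
        browser.quit()

The returned page source can be fed straight into etree.HTML(), so the rest of second_analysis would stay unchanged.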
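A second note: the SQL in upload_mysql and get_title is built with % string interpolation, so any title containing a quote breaks the statement (and the query is injectable). pymysql can escape values itself when they are passed separately to cursor.execute. A minimal sketch of the existence check rewritten that way, under the same table and column names as above; article_exists is a hypothetical helper name.

from pymysql import connect


def article_exists(title, web_name):
    # Let pymysql escape the values instead of splicing them into the SQL string
    conn = connect(host='127.0.0.1', port=3306, database='zj',
                   user='root', password='mysql', charset='utf8')
    try:
        with conn.cursor() as cursor:
            sql = 'select 1 from article_info where title = %s and web_name = %s'
            num = cursor.execute(sql, (title, web_name))
        return num > 0
    finally:
        conn.close()

The same pattern (placeholders plus a parameter tuple) would also apply to the three statements in upload_mysql.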

Reposted from: https://my.oschina.net/u/3892643/blog/3055002
