
import re
import datetime
import time

import redis
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient
from pymysql import connect
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities


class ArticleFilter(object):
    def __init__(self, title, content):
        # decode_responses=True so Redis returns str instead of bytes (split(',') needs str)
        self.redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=9, decode_responses=True)
        self.first_keywords = str(self.redis_client.get('first_keywords')).split(',')
        self.second_keywords = str(self.redis_client.get('second_keywords')).split(',')
        self.title = title
        self.content = content
        self.group_id_list = list()

    # Frequency of first-level keywords in the article content
    def article_content_filter(self):
        first_keyword_dict = dict()
        second_keyword_dict = dict()
        # Search the content
        if isinstance(self.content, list):
            text = ''.join([item.get('text') for item in self.content if item.get('text')])
            # Find the most frequent first-level keyword in the article content
            for first_keyword in self.first_keywords:
                num = text.count(first_keyword)
                if num > 0:
                    first_keyword_dict[first_keyword] = num
            first_res = self.select_high(first_keyword_dict)
            if len(first_res) == 1:
                keyword, num = first_res[0][0], first_res[0][1]
                keyword = {'first_keywords': keyword}
            else:
                # No single top first-level keyword (none, or a tie): fall back to second-level attribute words
                for second_keyword in self.second_keywords:
                    num = text.count(second_keyword)
                    if num > 0:
                        second_keyword_dict[second_keyword] = num
                second_res = self.select_high(second_keyword_dict)
                if len(second_res) == 1:
                    keyword, num = second_res[0][0], second_res[0][1]
                    keyword = {'second_keywords': keyword}
                elif len(second_res) > 1:
                    # Several second-level words tied for first place:
                    # publish the article into every matching second-level category
                    keyword = [x[0] for x in second_res]
                    keyword = {'second_keywords': keyword}
                else:
                    # No second-level match, but several first-level keywords tied:
                    # publish the article into every matching first-level category
                    if len(first_res) > 1:
                        keyword = [x[0] for x in first_res]
                        keyword = {'first_keywords': keyword}
                    else:
                        return False
            return keyword
        return False

    # Search the title
    def article_title_filter(self):
        first_keyword_dict = dict()
        for first_keyword in self.first_keywords:
            num = self.title.count(first_keyword)
            if num > 0:
                first_keyword_dict[first_keyword] = num
        first_res = self.select_high(first_keyword_dict)
        if len(first_res) == 1:
            keyword, num = first_res[0][0], first_res[0][1]
            first_keywords = {'first_keywords': keyword}
            return first_keywords
        return False

    # Keyword lookup -- main entry, returns the category IDs matching the article's keywords
    def article_filter(self):
        # 1. Search the title
        title_keyword = self.article_title_filter()
        if title_keyword:
            first_keywords = title_keyword.get('first_keywords')
            group_id = self.get_keyword_group_id(first_keywords)
            self.group_id_list.append(group_id)
        else:
            # 2. Search the content
            content_keyword = self.article_content_filter()
            if content_keyword:
                first_keywords = content_keyword.get('first_keywords')
                if isinstance(first_keywords, str):
                    group_id = self.get_keyword_group_id(first_keywords)
                    self.group_id_list.append(group_id)
                elif isinstance(first_keywords, list):
                    for first_keyword in first_keywords:
                        group_id = self.get_keyword_group_id(first_keyword)
                        self.group_id_list.append(group_id)
                else:
                    second_keywords = content_keyword.get('second_keywords')
                    if isinstance(second_keywords, str):
                        group_id = self.get_keyword_group_id(second_keywords)
                        self.group_id_list.append(group_id)
                    elif isinstance(second_keywords, list):
                        for second_keyword in second_keywords:
                            group_id = self.get_keyword_group_id(second_keyword)
                            self.group_id_list.append(group_id)
                    else:
                        self.group_id_list = None
            else:
                self.group_id_list = None
        return self.group_id_list

    # Pick the keyword(s) with the highest frequency
    @staticmethod
    def select_high(keyword_dict):
        ls = sorted(list(keyword_dict.items()), key=lambda a: a[1], reverse=True)
        index = 0
        # Keep every entry tied with the top count
        for i, x in enumerate(ls):
            if x[1] == ls[0][1]:
                index = i + 1
            else:
                break
        print(ls[:index])
        return ls[:index]

    # Fetch the article category ID for a keyword from Redis
    def get_keyword_group_id(self, keyword):
        article_group_id = self.redis_client.hget('group_id_of_keyword', keyword)
        return article_group_id

    # Sensitive-word filtering
    def sensitive_words_filter(self):
        try:
            sensitive_words = self.redis_client.get('sensitive_words')
            if sensitive_words:
                sensitive_words = sensitive_words.split(',')
                text = ''.join([item.get('text') for item in self.content if item.get('text')])
                for sensitive_word in sensitive_words:
                    resp_title = self.title.find(sensitive_word)
                    resp_content = text.find(sensitive_word)
                    if resp_title != -1 or resp_content != -1:
                        return True
                else:
                    return False
            else:
                return False
        except Exception as e:
            return False


class huxiu_spider(object):
    def __init__(self):
        self.base_url = 'https://www.huxiu.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
        }

    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        return text

    # Article list page
    def first_analysis(self, text):
        selector = etree.HTML(text)
        results = selector.xpath('//*[@id="index"]/div[2]/div[2]/div')
        # //*[@id="index"]/div[1]/div[2]/div[9]/div[1]/a/div/@style
        new_list = []
        i = 1
        for res in results:
            res_dict = {}
            web_name = '虎嗅网'
            res_dict['web_name'] = web_name
            # Article title
            title = res.xpath('div[1]/h2/a/text()')[0]
            print('正在爬取第%s篇文章,标题是:%s' % (i, title))
            num = self.get_title(title, web_name)
            print('查看文章是否存在=====')
            if num == 0:
                print('文章不存在~~~')
                url = res.xpath('div/h2/a[starts-with(@href, "/article")]/@href')[0]
                article_link = 'https://www.huxiu.com' + url
                article_content, article_time = self.second_analysis(article_link)
                if article_content != 1:
                    print('敏感词开始过滤')
                    # Local sensitive-word filtering
                    article_filter_obj = ArticleFilter(title, article_content)
                    resp = article_filter_obj.sensitive_words_filter()
                    if resp:
                        print('文章存在敏感词汇')
                    else:
                        # Article content
                        res_dict['content'] = article_content
                        # Publication time
                        res_dict['date'] = article_time
                        # Article URL
                        res_dict['article_link'] = article_link
                        # Article title
                        res_dict['title'] = title
                        # Article summary
                        summary = res.xpath('div/div[2]/text()')[0]
                        res_dict['summary'] = summary
                        # Author
                        name = res.xpath('div/div/a/span/text()')[0]
                        res_dict["name"] = name
                        # Author link
                        # res_dict["author_link"] = 'https://www.huxiu.com' + res.xpath('div/div/a/@href')[0]
                        # Cover image shown in the article list
                        if res.xpath('div/a/img/@data-original'):
                            min_pic = res.xpath('div/a/img/@data-original')[0]
                            oss_url = self.upload_oss(min_pic)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        elif res.xpath('a/div/img/@data-original'):
                            min_pic = res.xpath('a/div/img/@data-original')[0]
                            oss_url = self.upload_oss(min_pic)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        elif res.xpath('div/a/div/@style'):
                            # Video-style entry: cut the image URL out of the style attribute
                            mystr = res.xpath('div/a/div/@style')[0]
                            print(111, mystr)
                            start_index = mystr.find('(', 0, len(mystr))
                            end_index = mystr.find('?', 0, len(mystr))
                            min_pic = mystr[start_index + 2:end_index]
                            print(123, min_pic)
                            oss_url = self.upload_oss(min_pic)
                            print(321, oss_url)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        else:
                            oss_url = ''
                            res_dict["min_pic"] = oss_url
                        self.upload_mongo(res_dict)
                        self.upload_mysql(title, name, article_time, oss_url, summary, web_name, article_link)
                        print('成功获取并保存第%s篇文章' % i)
                        i += 1
                        new_list.append(res_dict)
                else:
                    i += 1
                    continue
            else:
                i += 1
                continue
        print('成功获取到%s篇文章' % (i - 1))

    # Article detail page
    def second_analysis(self, url):
        try:
            # Customize the PhantomJS request headers
            cap = DesiredCapabilities.PHANTOMJS.copy()
            for key, value in self.headers.items():
                cap['phantomjs.page.customHeaders.{}'.format(key)] = value
            # Pass the custom headers to the PhantomJS driver
            browser = webdriver.PhantomJS('/usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
                                          desired_capabilities=cap)
            browser.get(url)
            time.sleep(3)
            html = browser.page_source
            # Pick out the publication time
            selector = etree.HTML(html)
            if selector.xpath('//div[@class="column-link-box"]/span[1]/text()'):
                article_time = selector.xpath('//div[@class="column-link-box"]/span[1]/text()')[0]
                print(article_time)
            # //*[@id="article_content301428"]/p[138]/span[2]/text() ---new
            # //*[@class="article-author"]/span[2]/text() ---old
            elif selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text()'):
                article_time = selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text()')[0]
            else:
                article_time = ''
            # Head image inside the article
            if selector.xpath('//div[@class="article-img-box"]/img/@src'):
                article_min_pic = selector.xpath('//div[@class="article-img-box"]/img/@src')[0]
            else:
                article_min_pic = ""
            # Pick out the article body
            content = selector.xpath('//*[@class="article-content-wrap"]')[0]
            result = etree.tostring(content, method='html')
            print('获取到文章内容')
            # Build the bs4 object
            soup = BeautifulSoup(result, 'html.parser', from_encoding='utf-8')
            new_list = []
            # Collect content by tag
            ls = soup.find_all(["p", "img"])
            for table in ls:
                res = {}
                data = table.get_text()
                if data:
                    # Strip whitespace and special characters
                    new_data = "".join(data.split())
                    new_data = new_data.replace(u'\ufeff', '')
                    if new_data != "":
                        res["text"] = new_data
                        new_list.append(res)
                link = table.get('src')
                if link:
                    oss_url = self.upload_oss(link)
                    res["img"] = oss_url
                    new_list.append(res)
            if article_min_pic != '':
                article_min_pic = self.upload_oss(article_min_pic)
                # article_min_pic = article_min_pic.replace('http', 'https')
                new_list.insert(0, {'img': article_min_pic})
            browser.quit()
            return new_list, article_time
        except Exception as e:
            print('文章不存在了', e)
            return 1, 1

    # Upload an image to OSS
    def upload_oss(self, url):
        kw = {
            'fileurl': url,
            'filepath': 'gander_goose/dev/test2'
        }
        result = requests.post(url='http://api.max-digital.cn/Api/oss/uploadByUrl', data=kw)
        result = result.json()
        oss_url = result.get('oss_file_url')
        oss_url = oss_url.replace('maxpr.oss-cn-shanghai.aliyuncs.com', 'cdn.max-digital.cn')
        oss_url = oss_url.replace('http', 'https')
        return oss_url

    # Save the article to MongoDB
    def upload_mongo(self, article_dict):
        try:
            client = MongoClient('127.0.0.1', 27017)
            my_db = client.wechat
            my_db.articles.insert_one(article_dict)
            print('上传到mongo成功')
        except Exception as e:
            print('上传到mongo失败:', e)

    # Insert the article into MySQL
    def upload_mysql(self, title, name, date, oss_url, summary, web_name, link):
        try:
            # Create the connection
            conn = connect(host='localhost', port=3306, database='wechat',
                           user='root', password='mysql', charset='utf8')
            # Get a cursor
            cs1 = conn.cursor()
            # Insert one row
            now = datetime.datetime.now()
            imgurl = "https://cdn.max-digital.cn/gander_goose/dev/test2/15368082362561.jpg"
            sql1 = "insert into article_info (title,author,wechat_art_date,min_pic,summary,web_name,is_show,is_big,link,round_head_img,create_time) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                title, name, date, oss_url, summary, web_name, 0, 0, link, imgurl, now)
            cs1.execute(sql1)
            # ID of the article that was just inserted
            new_article_id = int(conn.insert_id())
            # Bump the custom sort value of articles in the "24 hours" category
            # sql2 = 'update article_group set sort_num = sort_num + 1 where group_id=1'
            # cs1.execute(sql2)
            # Publish into the "24 hours" category
            sql3 = 'insert into article_group (article_id,group_id,sort_num,create_time) values ("%s", "%s", "%s", "%s")' % (
                new_article_id, 1, 1, now)
            cs1.execute(sql3)
            # Mark the article as online
            sql4 = "update article_info set is_show = 1, zj_art_date='%s' where id='%s'" % (now, new_article_id)
            cs1.execute(sql4)
            conn.commit()
            cs1.close()
            conn.close()
            print('上传到mysql成功')
        except Exception as e:
            print('mysql上传失败:', e)

    # Check whether the article already exists in MySQL; returns the number of matching rows
    def get_title(self, title, query):
        conn = connect(host='127.0.0.1', port=3306, database='zj',
                       user='root', password='mysql', charset='utf8')
        # Get a cursor
        cs1 = conn.cursor()
        res = 'select * from article_info where title = "%s" and web_name = "%s" ' % (title, query)
        num = cs1.execute(res)
        return num

    def run(self):
        text = self.send_request(self.base_url)
        self.first_analysis(text)


if __name__ == '__main__':
    huxiu = huxiu_spider()
    while True:
        start_time = time.time()
        print('开始时间:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
        huxiu.run()
        time.sleep(3600)
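A note on the rendering step: PhantomJS is no longer maintained and Selenium 4 removed webdriver.PhantomJS entirely, so second_analysis will not start on a current setup. Below is a minimal sketch of the same "fetch rendered HTML" step using headless Chrome instead. It assumes Selenium 4+ and a chromedriver on PATH; fetch_rendered_html is a hypothetical helper, not part of the original spider.

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def fetch_rendered_html(url, user_agent):
    # Headless Chrome standing in for the PhantomJS call in second_analysis
    options = Options()
    options.add_argument('--headless')                     # no visible window
    options.add_argument('--user-agent=%s' % user_agent)   # same custom UA as self.headers
    browser = webdriver.Chrome(options=options)            # assumes chromedriver is on PATH
    try:
        browser.get(url)
        time.sleep(3)          # same crude wait as the original code
        return browser.page_source
    finally:
        browser.quit()

The returned page source can be fed straight into etree.HTML(), so the rest of second_analysis would stay unchanged.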
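A second note: the SQL in upload_mysql and get_title is built with % string interpolation, so any title containing a quote breaks the statement (and the query is injectable). pymysql can escape values itself when they are passed separately to cursor.execute. A minimal sketch of the existence check rewritten that way, under the same table and column names as above; article_exists is a hypothetical helper name.

from pymysql import connect


def article_exists(title, web_name):
    # Let pymysql escape the values instead of splicing them into the SQL string
    conn = connect(host='127.0.0.1', port=3306, database='zj',
                   user='root', password='mysql', charset='utf8')
    try:
        with conn.cursor() as cursor:
            sql = 'select 1 from article_info where title = %s and web_name = %s'
            num = cursor.execute(sql, (title, web_name))
        return num > 0
    finally:
        conn.close()

The same pattern (placeholders plus a parameter tuple) would also apply to the three statements in upload_mysql.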

Reposted from: https://my.oschina.net/u/3892643/blog/3055002
