imooc疯狂的蚂蚁《Python开发简单爬虫》源代码

以下为imooc疯狂的蚂蚁《Python开发简单爬虫》源代码，调试有些问题。

html_downloader.py（注：原文在此处贴出的实际上是 spider_main.py 的内容，与下文 spider_main.py 一节重复；HtmlDownloader 的源码在本页中缺失。）

import html_downloader
import html_output
import html_parser
import url_manager
import traceback


class SpiderMain:
    """Crawl driver that ties the URL manager, downloader, parser and outputer together."""

    def __init__(self):
        # Collaborators used by craw(): URL manager, page downloader,
        # page parser and result outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_output.HtmlOutputer()

    def craw(self, url, max_count=1000):
        """Crawl starting from ``url`` and write the collected data as HTML.

        Args:
            url: Root URL that seeds the URL manager.
            max_count: Stop after this many pages were processed without an
                exception (default 1000, the limit that used to be
                hard-coded). Values below 1 behave like 1.

        A page that raises while being fetched, parsed or collected is
        reported on the console and skipped; it does not count towards
        ``max_count``.
        """
        # Running number of the page being processed (1-based).
        count = 1
        # Seed the manager with the root URL.
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('第%d个url：%s' % (count, new_url))
                # Download the page behind this URL.
                html_cont = self.downloader.download(new_url)
                # Parse it into outgoing links and the page's own data.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Queue the newly discovered links and keep the data.
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count >= max_count:
                    break
                count += 1
            except Exception:
                # Not a bare ``except:`` so that Ctrl-C (KeyboardInterrupt)
                # and SystemExit can still stop the crawler.
                print('爬取失败')
                traceback.print_exc()
        # Dump everything that was collected into one HTML file.
        self.outputer.output_html()


if __name__ == '__main__':
    # Alternative entry in the newer URL scheme:
    # https://baike.baidu.com/item/Python/407313
    root_url = 'http://baike.baidu.com/view/21087.htm'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

html_output.py

import html


class HtmlOutputer(object):
    """Collects per-page result dicts and renders them as one HTML table."""

    def __init__(self):
        # Collected records are kept in a list, in the order they arrive.
        self.datas = []

    def collect_data(self, new_data):
        """Store one record; ``None`` is ignored."""
        if new_data is None:
            return
        self.datas.append(new_data)

    def output_html(self):
        """Write all collected records to ``output.html`` (UTF-8).

        Every record must provide the keys ``'url'``, ``'title'`` and
        ``'summary'``; a missing key raises ``KeyError``. The values come
        from crawled pages, so they are HTML-escaped before being placed
        into the table cells.
        """
        # ``with`` guarantees the file is closed even if a record is
        # malformed and formatting raises half-way through.
        with open('output.html', 'w', encoding='utf-8') as fileout:
            fileout.write('<html>')
            fileout.write('<head>')
            fileout.write('<meta charset=\'utf-8\'>')
            fileout.write('</head>')
            fileout.write('<body>')
            fileout.write('<table>')
            for data in self.datas:
                fileout.write('<tr>')
                for key in ('url', 'title', 'summary'):
                    # quote=False: only &, < and > matter inside element text.
                    cell = html.escape(str(data[key]), quote=False)
                    fileout.write('<td>%s</td>' % cell)
                fileout.write('</tr>')
            fileout.write('</table>')
            fileout.write('</body>')
            fileout.write('</html>')

html_parser.py

import re
import urllib.parse
from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extracts outgoing entry links and title/summary data from one page."""

    # Entry links have the form /view/12334.htm. The original notes also
    # mention a second form, /item/<percent-encoded-name>/2259975, which
    # this pattern does not match.
    _ENTRY_LINK = re.compile(r'/view/\d+\.htm')

    def parse(self, page_url, html_cont):
        """Parse ``html_cont`` that was downloaded from ``page_url``.

        Returns:
            A ``(new_urls, new_data)`` pair: a set of absolute URLs found on
            the page, and a dict with ``'url'``, ``'title'`` and
            ``'summary'`` (or ``None`` when the page has no title node).
            If either argument is ``None`` the result is ``(set(), None)``,
            so callers that unpack two values keep working.
        """
        if page_url is None or html_cont is None:
            return set(), None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs of all entry links on the page."""
        links = soup.find_all('a', href=self._ENTRY_LINK)
        # hrefs may be relative, so resolve each one against the page URL.
        return {urllib.parse.urljoin(page_url, link['href']) for link in links}

    def _get_new_data(self, page_url, soup):
        """Return the page's url/title/summary dict, or ``None`` without a title.

        A missing summary node yields an empty ``'summary'`` string.
        """
        # Title markup: <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        title_node = title_box.find('h1') if title_box is not None else None
        if title_node is None:
            # Page does not have the expected layout; report no data instead
            # of failing on a None attribute access.
            return None
        # Summary markup: <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        return {
            'url': page_url,
            'title': title_node.get_text(),
            'summary': summary_node.get_text() if summary_node is not None else '',
        }

spider_main.py

import html_downloader
import html_output
import html_parser
import url_manager
import traceback


class SpiderMain:
    """Crawl driver that ties the URL manager, downloader, parser and outputer together."""

    def __init__(self):
        # Collaborators used by craw(): URL manager, page downloader,
        # page parser and result outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_output.HtmlOutputer()

    def craw(self, url, max_count=1000):
        """Crawl starting from ``url`` and write the collected data as HTML.

        Args:
            url: Root URL that seeds the URL manager.
            max_count: Stop after this many pages were processed without an
                exception (default 1000, the limit that used to be
                hard-coded). Values below 1 behave like 1.

        A page that raises while being fetched, parsed or collected is
        reported on the console and skipped; it does not count towards
        ``max_count``.
        """
        # Running number of the page being processed (1-based).
        count = 1
        # Seed the manager with the root URL.
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('第%d个url：%s' % (count, new_url))
                # Download the page behind this URL.
                html_cont = self.downloader.download(new_url)
                # Parse it into outgoing links and the page's own data.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Queue the newly discovered links and keep the data.
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count >= max_count:
                    break
                count += 1
            except Exception:
                # Not a bare ``except:`` so that Ctrl-C (KeyboardInterrupt)
                # and SystemExit can still stop the crawler.
                print('爬取失败')
                traceback.print_exc()
        # Dump everything that was collected into one HTML file.
        self.outputer.output_html()


if __name__ == '__main__':
    # Alternative entry in the newer URL scheme:
    # https://baike.baidu.com/item/Python/407313
    root_url = 'http://baike.baidu.com/view/21087.htm'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

url_manager.py

class UrlManager(object):
    """Tracks which URLs are still waiting to be crawled and which are done."""

    def __init__(self):
        # URLs waiting to be crawled.
        self.new_urls = set()
        # URLs that have already been handed out by get_new_url().
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue ``url`` unless it is ``None``, already queued, or already handed out."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return ``True`` while at least one URL is waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one waiting URL, record it as handed out, and return it.

        The URL is taken with ``set.pop()``, so which one is returned is
        arbitrary.

        Raises:
            KeyError: if no URL is waiting; check ``has_new_url()`` first.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, new_urls):
        """Queue every URL from ``new_urls``; ``None`` or an empty iterable is a no-op.

        Any iterable is accepted, including generators, which have no
        ``len()``.
        """
        if new_urls is None:
            return
        for new_url in new_urls:
            self.add_new_url(new_url)

imooc疯狂的蚂蚁《Python开发简单爬虫》源代码相关推荐

python秒懂百科视频,Python开发简单爬虫
Python开发简单爬虫源码网址: http://download.csdn.NET/detail/hanchaobiao/9860671 一.爬虫的简介及爬虫技术价值 1.什么是爬虫: 一段自动 ...
Python开发简单爬虫 - 慕课网
课程链接:Python开发简单爬虫环境搭建: Eclipse+PyDev配置搭建Python开发环境 Python入门基础教程用Eclipse编写Python程序课程目录第1章课程介绍 1- ...
python简单爬虫程序分析_[Python专题学习]-python开发简单爬虫
掌握开发轻量级爬虫,这里的案例是不需要登录的静态网页抓取.涉及爬虫简介.简单爬虫架构.URL管理器.网页下载器(urllib2).网页解析器(BeautifulSoup) 一.爬虫简介以及爬虫的技术价 ...
python一般用来开发什么-python主要用来做什么？Python开发简单吗？
python主要用来做什么?Python开发简单吗?Python技术可做web开发.Python技术可做数据分析.Python技术可做人工智能.将Python用于机器学习,流行的Python机器学习库 ...
python主要用来做什么-python主要用来做什么？Python开发简单吗？
python主要用来做什么?Python开发简单吗?Python技术可做web开发.Python技术可做数据分析.Python技术可做人工智能.将Python用于机器学习,流行的Python机器学习库 ...
c语言实现爬虫功能,用C/C 扩展Python语言_python 调用c语言 python实现简单爬虫功能_python实现简单爬虫...
用C/C 扩展Python语言 Python是一门功能强大的脚本语言,它的强大不仅表现在功能上,还表现在其扩展性上.她提供大量的API以方便程序员利用C/C++对Python进行扩展.因为执行速度慢几 ...
python_2开发简单爬虫
2017年12月03日 16:43:01 独行侠的守望阅读数:204 标签: python爬虫更多个人分类: Python 编辑版权声明:本文为博主原创文章,转载请注明文章链接. https: ...
记事本写python怎么运行-Python开发简单记事本
摘要: 本文是使用Python,结合Tkinter开发简单记事本. 本文的操作环境:ubuntu,Python2.7,采用的是Pycharm进行代码编辑,个人很喜欢它的代码自动补齐功能. 最近很想对p ...
python实现简单爬虫抓取图片
最近在学习python,正如大家所知,python在网络爬虫方面有着广泛的应用,下面是一个利用python程序抓取网络图片的简单程序,可以批量下载一个网站更新的图片,其中使用了代理IP的技术. imp ...

imooc疯狂的蚂蚁《Python开发简单爬虫》源代码

imooc疯狂的蚂蚁《Python开发简单爬虫》源代码相关推荐

最新文章

热门文章