

A ) 爬取目标:

比如,我们现在需要爬取网页 https://brands.cnblogs.com/ 中的资讯列表数据。




B ) Html标签分析:
















专栏名称:div div div div div div h1 a;

专栏副标题:div.wrapper div.bannerpage div div div div p,副标题里面有两个P标签,获取时需要foreach循环获取;

文章页数:div div div div div div ul li a;

文章标题:div.article-item div.title h2;

文章摘要:div.article-item div.summary p;

文章作者:div.article-item div.post-info div.author span;

浏览数量:div.article-item div.post-info div.views span;

点赞人数:div.article-item div.post-info div.likes span;

缩略图片:div.article-item figure a img;

详情地址:div.article-item div.title h2 a;






A )添加路由:


// Route group for the supplier HTML endpoints, registered with prefix('html.Supplier').
// NOTE(review): 'suppplier' (three p's) looks like a typo, but it is part of the
// public URL path, so it is left unchanged here; confirm before renaming.
Route::group('html/suppplier', function () {
    // Crawl the news/article list data.
    Route::get('crawler_news', '/crawlerNews')->name('htmlSupplierCrawlerNews');
})->prefix('html.Supplier');

B )实现刚配置的接口函数:


    /**
     * Crawl the news/article list data.
     *
     * Reads the `page` request parameter, hands it to
     * EchartRepository::getHtmlTags() and wraps the result in a JSON
     * success response.
     *
     * @return mixed
     * @author huidaoli
     * @day 2021-11-17
     */
    public function crawlerNews(EchartRepository $EchartRepository)
    {
        $params = $this->request->params(['page']);

        return app('json')->success($EchartRepository->getHtmlTags($params));
    }

C )爬取专栏相关信息:


    /**
     * Crawl the column (landing page) information together with the article list.
     *
     * Loads https://brands.cnblogs.com/, takes the page count from the
     * pagination links, delegates the article crawl to scraping_digg() and
     * then adds the column title and subtitle lines to that result.
     *
     * @param array $data Request parameters; `page` is replaced by the value
     *                    scraped from the pagination links when one is found,
     *                    and defaults to 1 when neither source provides it.
     * @return array The scraping_digg() result plus `titles` (string, empty
     *               when the title link is missing) and `subtitle` (list of
     *               strings, empty when no subtitle paragraph is found).
     * @throws \RuntimeException When the landing page cannot be loaded.
     */
    public function getHtmlTags(array $data)
    {
        $htmlObj = new simple_html_dom();
        // load_file() reports a failed fetch by returning false.
        if ($htmlObj->load_file('https://brands.cnblogs.com/') === false) {
            $htmlObj->clear();
            throw new \RuntimeException('Failed to load https://brands.cnblogs.com/');
        }

        // The fourth pagination link (index 3) is treated as the page count.
        $pageLink = $htmlObj->find('div div div div div div ul li a', 3);
        if ($pageLink !== null) {
            $data['page'] = $pageLink->innertext;
        } elseif (empty($data['page'])) {
            // No pagination found and no page requested: crawl the first page only.
            $data['page'] = 1;
        }

        $ret = $this->scraping_digg($data);

        $titleLink = $htmlObj->find('div div div div div div h1 a', 0);
        $ret['titles'] = $titleLink !== null ? $titleLink->innertext : '';

        // The subtitle is spread over several <p> tags; collect each one.
        $ret['subtitle'] = [];
        foreach ($htmlObj->find('div.wrapper div.bannerpage div div div div p') as $paragraph) {
            $ret['subtitle'][] = $paragraph->plaintext;
        }

        // Release the parsed DOM, mirroring what scraping_digg() does per page.
        $htmlObj->clear();
        unset($htmlObj);

        return $ret;
    }

D )爬取文章信息:


    /**
     * Crawl the article list of every page from 1 up to $data['page'].
     *
     * @param array $data Expects `page`: the last page number to crawl.
     * @return array `list` (one row per article with title, details, diggs1
     *               = author, diggs2 = views, diggs3 = likes, img, href),
     *               `limit` (number of rows collected) and `page` (the page
     *               value that was passed in, unchanged).
     * @throws \RuntimeException When a list page cannot be loaded.
     */
    public function scraping_digg(array $data)
    {
        $URL = 'https://brands.cnblogs.com';
        $page = $data['page'] ?? null;
        // Cast for the loop bound only: comparing an int with a non-numeric
        // string could otherwise keep the loop running indefinitely.
        $lastPage = (int) $page;

        // Trimmed plain text of a node, or '' when the selector matched nothing.
        $text = static function ($node) {
            return $node === null ? '' : trim($node->plaintext);
        };

        $ret = ['list' => []];
        for ($i = 1; $i <= $lastPage; $i++) {
            $url = $URL . '/' . $i;
            $html = new simple_html_dom();
            // load_file() reports a failed fetch by returning false.
            if ($html->load_file($url) === false) {
                $html->clear();
                throw new \RuntimeException('Failed to load ' . $url);
            }

            foreach ($html->find('div.article-item') as $article) {
                $image = $article->find('figure a img', 0);
                // Index 1: the second anchor inside the title heading is used as the details link.
                $detailLink = $article->find('div.title h2 a', 1);

                $ret['list'][] = [
                    'title'   => $text($article->find('div.title h2', 0)),
                    'details' => $text($article->find('div.summary p', 0)),
                    'diggs1'  => $text($article->find('div.post-info div.author span', 0)),
                    'diggs2'  => $text($article->find('div.post-info div.views span', 0)),
                    'diggs3'  => $text($article->find('div.post-info div.likes span', 0)),
                    'img'     => $image === null ? '' : trim((string) $image->src),
                    'href'    => $detailLink === null ? '' : $URL . trim((string) $detailLink->href),
                ];
            }

            // Release the parsed DOM before loading the next page.
            $html->clear();
            unset($html);
        }

        $ret['limit'] = count($ret['list']);
        $ret['page'] = $page;

        return $ret;
    }



{"status": 200,"message": "success","data": {"list": [{"title": "赶紧收藏!7大类400多种组件,鸿蒙三方库来了!","details": "目前,HarmonyOS三方库涵盖7大类400多种,并且还在持续壮大中。以下为三方库的介绍及资源地址,赶紧收藏!","diggs1": "eva3w","diggs2": "3075","diggs3": "0","img": "https://img2020.cnblogs.com/blog/2016690/202105/2016690-20210512143438411-894995434.png?v=20200821","href": "https://brands.cnblogs.com/huawei/p/2174"},{"title": "上云没错,上错了云,才是错!","details": "疫情更像加速器,加速了企业云化的进程。企业 IT 需要更弹性和稳定的基础架构、更先进并持续更新的技术栈、永远在线的体验和有保障的 SLA。 「上云」是不可逆转的趋势,但是,上云要找老司机,选择科学的方法和正确的步调。 上错了云,就像上贼船,...","diggs1": "小黑羊","diggs2": "1510","diggs3": "7","img": "https://img2020.cnblogs.com/blog/1/202102/1-20210224163516159-1710022906.jpg?v=20200821","href": "https://brands.cnblogs.com/aws/p/2081"}],"limit": 3,"page": "1","titles": "博客园品牌专区","subtitle": ["你的远见,被千万开发者看见","Stay with developers. Build your brand."]}}




