背景介绍

通过自动点击页面来抓取文章信息。这个脚本很适合用来查看某个关键词近几年的研究趋势~
半自动：当遇到谷歌人机验证，需要手动完成。
注意将 selenium 升级到最新版本，它会自动下载 Chrome 内核。
可对脚本修改，来抓取更多数据、或者统计更多信息。
注释非常详细；
需要什么检索规则，可以根据原官网检索后的URL，修改代码中的URL；

运行效果

未来改进

抓取一页，写入一页，免得中途崩溃数据全没；
更多异常页面检测（目前很少遇到）；
抓取和整理更多信息；
记录坐标，断点续抓；
更换ip；
发现谷歌学术只能搜100页，ε=(´ο｀*)))唉；

参考代码

先在这里下一个驱动：https://chromedriver.storage.googleapis.com/index.html?path=114.0.5735.90/

import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re
from tqdm import tqdm
import pyautogui
from bs4 import BeautifulSoup
import lxml
import pandas as pd
from enum import Enum
import pyperclip
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import os
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')


class Errors(Enum):
    """Page states that ``Scholar.check_error`` can report."""

    SUCCESS = '成功'
    SERVER_ERROR = '服务器错误'


class Scholar:
    """Click through Google Scholar result pages in Chrome and collect article records.

    Collected records accumulate in ``self.results`` as dicts with the keys
    ``title``, ``href`` and ``year``.
    """

    def __init__(self, out_filepath) -> None:
        """Remember the output directory (creating it if needed) and reset state.

        Args:
            out_filepath: Directory that receives every file this class writes.
        """
        self.out_filepath = out_filepath
        # makedirs also handles nested paths and an already-existing directory.
        os.makedirs(self.out_filepath, exist_ok=True)
        self.driver = None
        self.results = []

    def start_browser(self, wait_time=10):
        """Launch Chrome with the script's option set and store it in ``self.driver``.

        Args:
            wait_time: Implicit wait, in seconds, applied to element lookups.
        """
        options = Options()
        # Headless mode is deliberately not enabled: the captcha has to be
        # solved by hand in a visible window (see ``search``).
        chrome_arguments = (
            "--incognito",
            "--disable-domain-reliability",
            "--disable-blink-features=AutomationControlled",
            "--disable-client-side-phishing-detection",
            "--no-first-run",
            "--use-fake-device-for-media-stream",
            "--autoplay-policy=user-gesture-required",
            "--disable-features=ScriptStreaming",
            "--disable-notifications",
            "--disable-popup-blocking",
            "--disable-save-password-bubble",
            "--mute-audio",
            "--no-sandbox",
            "--disable-gpu",
            "--disable-extensions",
            "--disable-software-rasterizer",
            "--disable-dev-shm-usage",
            "--disable-webgl",
            "--allow-running-insecure-content",
            "--no-default-browser-check",
            "--disable-full-form-autofill-ios",
            "--disable-autofill-keyboard-accessory-view[8]",
            "--disable-single-click-autofill",
            "--ignore-certificate-errors",
            "--disable-infobars",
            "--disable-blink-features=AutomationControlled",
            "--disable-blink-features",
        )
        for argument in chrome_arguments:
            options.add_argument(argument)
        # Keep the experimental QUIC protocol switched off.
        options.add_experimental_option("excludeSwitches", ["enable-quic"])
        # NOTE(review): newer webdriver_manager releases appear to have renamed
        # this keyword to ``driver_version`` -- confirm against the installed version.
        self.driver = webdriver.Chrome(
            options=options,
            service=Service(ChromeDriverManager(version="114.0.5735.90").install()),
        )
        self.driver.maximize_window()
        self.driver.implicitly_wait(wait_time)

    def __search_onepage(self):
        """Scrape the articles listed on the page currently shown.

        Returns:
            A list of ``{'title', 'href', 'year'}`` dicts; empty when the page
            has no result list. ``year`` is the last 4-digit run found in the
            publisher line, or ``-1`` when there is none.
        """
        if not self.check_element_exist(check_type='ID', value='gs_res_ccl_mid'):
            print('>> 当前页面不存在文章列表')
            return []

        results = []
        result_list = self.driver.find_element(by=By.ID, value='gs_res_ccl_mid')
        for item in tqdm(result_list.find_elements(by=By.CLASS_NAME, value='gs_scl')):
            gs_rt = item.find_element(by=By.CLASS_NAME, value='gs_rt')
            gs_a = item.find_element(by=By.CLASS_NAME, value='gs_a')
            # Probe the title's own HTML first so that a title without a link
            # does not make find_element block for the whole implicit wait.
            has_link = self.check_element_exist(
                check_type='TAG_NAME', value='a', source=gs_rt.get_attribute('innerHTML'))
            gs_rt_a = gs_rt.find_element(by=By.TAG_NAME, value='a') if has_link else None
            publisher_info = gs_a.text.strip().replace('\n', '')
            # Drop any leading "[...]" tag from the title text.
            title = gs_rt.text.strip().replace('\n', '').split(']')[-1].strip()
            href = gs_rt_a.get_attribute('href') if gs_rt_a else ''
            year = re.findall(r'\d{4}', publisher_info)
            year = year[-1] if year else -1
            results.append({'title': title, 'href': href, 'year': year})
        return results

    def check_element_exist(self, value, check_type='CLASS_NAME', source=None) -> bool:
        """Report whether ``value`` occurs in the given HTML.

        Args:
            value: The id, class name, tag name or raw substring to look for.
            check_type: One of ``'ID'``, ``'CLASS_NAME'``, ``'TAG_NAME'`` or
                ``'FULL'`` (plain substring search).
            source: HTML to inspect; ``None`` means the driver's current page.

        Returns:
            True when a match exists; False otherwise, including for an
            unknown ``check_type``.
        """
        # Only fall back to the live page when no source was passed at all: an
        # empty fragment must be searched as empty, not replaced by the page.
        page_source = self.driver.page_source if source is None else source
        if check_type == 'FULL':
            return value in page_source
        soup = BeautifulSoup(page_source, 'lxml')
        if check_type == 'ID':
            return len(soup.find_all(id=value)) != 0
        if check_type == 'CLASS_NAME':
            return len(soup.find_all(class_=value)) != 0
        if check_type == 'TAG_NAME':
            return len(soup.find_all(value)) != 0
        print(f'>> 检查条件[{check_type}]不对')
        return False

    def check_captcha(self) -> bool:
        """Report whether a captcha form is present (Scholar's or Google Search's)."""
        return self.check_element_exist(check_type='ID', value='gs_captcha_f') or \
            self.check_element_exist(check_type='ID', value='captcha-form')

    def process_error(self, error: Errors) -> bool:
        """Try to recover from ``error``; currently no recovery is implemented.

        Returns:
            True when the error was resolved. Always False at present.
        """
        success = False
        if error == Errors.SERVER_ERROR:
            pass
        return success

    def check_error(self, try_solve=True) -> Errors:
        """Inspect the current page for a known error.

        Args:
            try_solve: When True, hand a detected error to ``process_error``.

        Returns:
            ``Errors.SUCCESS`` when no error is found or it was resolved,
            otherwise the detected error.
        """
        error = Errors.SUCCESS
        if self.check_element_exist(check_type='FULL', value='服务器错误'):
            error = Errors.SERVER_ERROR
        if try_solve and error != Errors.SUCCESS:
            error = Errors.SUCCESS if self.process_error(error) else error
        return error

    def __scroll2bottom(self):
        """Scroll the top-level document to the bottom of the page."""
        self.driver.switch_to.default_content()
        js = "var q=document.documentElement.scrollTop=100000"
        self.driver.execute_script(js)

    def search(self, keywords, sort_bydate=False, as_ylo='', as_yhi='', max_pages=100, delay=0):
        """Run a Scholar query and walk the result pages, collecting articles.

        Pauses with a dialog whenever a captcha or an error page shows up so
        that it can be handled by hand.

        Args:
            keywords: Raw query text.
            sort_bydate: Adds ``scisbd=1`` to the URL when True.
            as_ylo: Value for the ``as_ylo`` URL parameter (start year).
            as_yhi: Value for the ``as_yhi`` URL parameter (end year).
            max_pages: Upper bound on loop iterations; a retry of an empty
                page also uses one up.
            delay: Seconds to sleep after each click on the next-page button.

        Returns:
            ``self.results``, extended with everything scraped.
        """
        from urllib.parse import quote_plus

        # Percent-encode the query so characters such as '&' or '#' cannot
        # break the URL; spaces still become '+'.
        keywords = quote_plus(keywords)
        sort_bydate = 'scisbd=1' if sort_bydate else ''
        url = f'https://scholar.google.com/scholar?{sort_bydate}&hl=zh-CN&as_sdt=0%2C5&q={keywords}&btnG=&as_ylo={as_ylo}&as_yhi={as_yhi}'
        self.driver.get(url)

        for _ in tqdm(range(1, max_pages+1), desc='搜索中'):
            # Block until the user has cleared any captcha.
            while self.check_captcha():
                pyautogui.alert(title='状态异常', text='请手动完成人机验证后，点击“已完成”', button='已完成')
                self.driver.refresh()
                time.sleep(2)

            if self.check_error() != Errors.SUCCESS:
                if pyautogui.confirm(text='请检查页面出现了什么问题;\n解决后，点击“确定”将会重试;\n否则，点击“取消”提前结束脚本;', title='状态异常', buttons=['确定', '取消']) == '取消':
                    print('>> 提前结束')
                    break
                time.sleep(2)

            onepage = self.__search_onepage()
            if not onepage:
                print('>> 当前页为空, 重试')
                self.driver.refresh()
                time.sleep(2)
                continue
            self.results.extend(onepage)

            if not self.check_element_exist(check_type='CLASS_NAME', value='gs_ico_nav_next'):
                print('>> 全部结束')
                break
            self.__scroll2bottom()
            time.sleep(0.1)
            self.driver.find_element(by=By.CLASS_NAME, value="gs_ico_nav_next").click()
            time.sleep(delay)

        # Record the result-count banner, but only when it is actually on the
        # page: a failed lookup here must not discard the scraped results.
        if self.check_element_exist(check_type='ID', value='gs_ab_md') and \
                self.check_element_exist(check_type='CLASS_NAME', value='gs_ab_mdw'):
            total_num = self.driver.find_element(by=By.ID, value='gs_ab_md').find_element(
                by=By.CLASS_NAME, value='gs_ab_mdw').text.strip()
            with open(os.path.join(self.out_filepath, 'total_num.txt'), 'w', encoding='utf-8') as fh:
                fh.write(total_num)
        else:
            print('>> 未找到结果总数, 跳过写入 total_num.txt')
        return self.results

    def close_browser(self):
        """Quit the browser session."""
        self.driver.quit()

    def save_file(self, filename='scholar.xlsx', nodup=False):
        """Write the collected records to an Excel file in the output directory.

        If saving fails, a dialog offers to copy the data to the clipboard.

        Args:
            filename: Name of the Excel file to create.
            nodup: When True, drop records that are exact duplicates
                (same title, href and year), keeping first-seen order.
        """
        unique_data = self.results
        if nodup:
            # dict.fromkeys de-duplicates while keeping the original order.
            unique_data = [dict(t) for t in dict.fromkeys(tuple(d.items()) for d in unique_data)]
            print(f'>> 去重效果：{len(self.results)} => {len(unique_data)}')
        try:
            # No ``encoding`` keyword: pandas removed it from to_excel in 2.0.
            pd.DataFrame(unique_data).dropna().reset_index(drop=True).to_excel(
                os.path.join(self.out_filepath, filename), index=False)
        except Exception as e:
            if pyautogui.confirm(text=f'文件保存失败[{str(e)}]\n点击“确定”将内容复制到剪切板;\n否则, 点击“取消”直接结束脚本;', title='文件保存失败', buttons=['确定', '取消']) == '确定':
                pyperclip.copy(str(unique_data))

    def statistical_information(self):
        """Placeholder; not implemented."""
        pass


class AnalyzeDraw:
    """Build a word cloud and a word-frequency table from the saved titles."""

    def __init__(self, out_filepath, filename='scholar.xlsx') -> None:
        """Load the Excel file from ``out_filepath`` into ``self.df``.

        Args:
            out_filepath: Directory holding the input file and receiving outputs.
            filename: Excel file to read.
        """
        self.out_filepath = out_filepath
        os.makedirs(self.out_filepath, exist_ok=True)
        self.filename = filename
        self.df = pd.read_excel(os.path.join(self.out_filepath, filename))

    def draw_wordcloud(self):
        """Render a word cloud of the titles to ``<filename>.jpg``.

        Titles are tokenized with NLTK; only alphabetic tokens that are not
        English stop words are kept. The ``title`` column of ``self.df`` is
        converted to ``str`` in place.
        """
        english_stopwords = set(stopwords.words('english'))
        self.df['title'] = self.df['title'].astype(str)
        english_titles = self.df['title'].apply(
            lambda x: ' '.join([
                word.lower() for word in nltk.word_tokenize(x)
                if word.isalpha() and word.lower() not in english_stopwords
            ]))
        text = ' '.join(english_titles)
        wc = WordCloud(width=800, height=400, background_color='white').generate(text)
        wc.to_file(os.path.join(self.out_filepath, f'{self.filename}.jpg'))

    def draw_wordsfrequency(self):
        """Count whitespace-separated title words and save them to ``word_frequency.xlsx``.

        Words are lower-cased, a fixed stop-word list is removed, and rows are
        ordered from most to least frequent.
        """
        stop_words = ['a', 'an', 'and', 'or', 'in', 'on', 'for', 'with', 'the', 'using', 'based', 'to', 'by', 'its', 'it', '&', 'as', 'via', 'base', 'improve', 'improved',]
        # Cast to str so non-text cells cannot break the join; this also makes
        # the method independent of draw_wordcloud having run first.
        word_counts = Counter(' '.join(self.df['title'].astype(str)).lower().split())
        for stop_word in stop_words:
            word_counts.pop(stop_word, None)
        sorted_counts = word_counts.most_common()
        words = [item[0] for item in sorted_counts]
        freqs = [item[1] for item in sorted_counts]
        df_freq = pd.DataFrame({'Word': words, 'Frequency': freqs})
        df_freq.to_excel(os.path.join(self.out_filepath, 'word_frequency.xlsx'), index=False)


if __name__ == '__main__':
    keywords = input('>> 请输入搜索关键词: ').strip() or 'allintitle: Multimodal ("Graph Neural Network" OR GNN)'
    as_ylo = input('>> 请输入开始年份(留空为1900): ').strip() or '1900'
    as_yhi = input('>> 请输入结束年份(留空为不限): ').strip()
    max_pages = input('>> 请输入爬取多少页(最多为100): ').strip() or '100'
    sort_bydate = (input('>> 是否按日期排序(y/n, 默认否, 会覆盖年份): ').strip() or 'n') == 'y'
    out_filepath = '_'.join(keywords.replace('"', '').replace(':', '').split())

    scholar = Scholar(out_filepath)
    scholar.start_browser(wait_time=60)
    try:
        scholar.search(keywords, sort_bydate, as_ylo, as_yhi, max_pages=int(max_pages), delay=random.randint(0, 0))
    finally:
        # Always release the browser, even when the crawl raises.
        scholar.close_browser()
    scholar.save_file(nodup=True)

    analyze = AnalyzeDraw(out_filepath)
    analyze.draw_wordcloud()
    analyze.draw_wordsfrequency()
    print('>> all done <<')

【教程】爬取和统计Google Scholar上指定关键词的文章信息相关推荐

python爬取汽车之家_python爬虫实战之爬取汽车之家网站上的图片
随着生活水平的提高和快节奏生活的发展.汽车开始慢慢成为人们的必需品,浏览各种汽车网站便成为购买合适.喜欢车辆的前提.例如汽车之家网站中就有最新的报价和图片以及汽车的相关内容,是提供信息最快最全的中国汽 ...
爬取唯美女生网站上所有小姐姐的照片
爬取唯美女生网站前几天刚好看到有人在教学爬取唯美女生网站(https://www.vmgirls.com)的图片,大致听了一下,只是介绍了一次爬取某一个小姐姐的照片,因此就尝试做一个爬取全站小姐姐照 ...
python房价数据分析统计服_Python 爬取分析全国 12 个城市 4 万条房价信息，告诉你该怎样买房？...
原标题:Python 爬取分析全国 12 个城市 4 万条房价信息,告诉你该怎样买房? 作者 | 月小水长责编 | 伍杏玲通过分页.线程池.代理池等技术,快速爬取链家网近4万条在售二手房信息,速度可 ...
python 爬取https://wall.alphacoders.com上的壁纸(入门级别)
python 爬取https://wall.alphacoders.com上的壁纸 0,环境 python3.7 库:requests,BeautifulSoup4 1,目标 https://wall ...
Python | 使用Python爬取Wallhaven网站壁纸并上传百度网盘
更多详情请查看Honker Python | 使用Python爬取Wallhaven网站壁纸并上传百度网盘给大家推荐一款超好用的壁纸下载网站-- wallhaven 第一次知道这个网站的时候,惊为天 ...
Python运用urllib2和BeautifulSoup爬取网站ZOL桌面壁纸上的精美电脑壁纸
Python运用urllib2和BeautifulSoup爬取网站ZOL桌面壁纸上的精美电脑壁纸 #!/usr/bin/env python # -*- coding: utf-8 -*- # @Ti ...
爬取全民K歌主页上的歌
爬取全民K歌主页上的歌 import urllib.request import requests import os import repath = "C:\\Users\\HUAWEI\ ...
python爬取网站时，一键获取headers、url等信息（真的是让我爬取网站时，省了不少力气，作为小秘密分享给大家喽）
python爬取网站时,一键获取headers.url等信息 (真的是让我爬取网站时,省了不少力气,作为小秘密分享给大家喽) 作为Python的使用者,我今天就和大家来分享分享我学习python的经验 ...
scrapy 解析css,Scrapy基础(六)————Scrapy爬取伯乐在线一通过css和xpath解析文章字段...
上次我们介绍了scrapy的安装和加入debug的main文件,这次重要介绍创建的爬虫的基本爬取有用信息通过命令(这篇博文)创建了jobbole这个爬虫,并且生成了jobbole.py这个文件,又写 ...

【教程】爬取和统计Google Scholar上指定关键词的文章信息

背景介绍

运行效果

未来改进

参考代码

【教程】爬取和统计Google Scholar上指定关键词的文章信息相关推荐

最新文章

热门文章