Python爬虫诗词名句网教程

爬取的基本步骤

一、明确需求

爬取网站内的小说名

小说内容

二、代码讲解

下面根据代码，由浅入深地给大家讲解分析一遍

`# -*- coding: utf-8 -*-`，开头的这一行用来声明源文件的编码为 utf-8，必须写在文件的第一行或第二行，防止中文乱码

然后下面 import就是导入一些库，做做准备工作

import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *

首先我们要伪装成一个浏览器，再去访问我们需要爬取的网站

百度百科：
User Agent中文名为用户代理，简称 UA，它是一个特殊字符串头，使得服务器能够识别客户使用的操作系统及版本、CPU 类型、浏览器及版本、浏览器渲染引擎、浏览器语言、浏览器插件等。

以Chrome浏览器为例，在浏览器地址栏输入 chrome://version

可以看到，浏览器User-Agent为Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36

下面是详细的代码

        # Send a desktop-browser User-Agent so the site serves the normal page
        # to this script.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
            )
        }
        # %s is filled in with the pinyin spelling of the novel's name.
        url = 'https://www.shicimingju.com/book/%s.html'
        name_p = pinyin.pinyin(name)

我们先建立起一个数据表用来保存一会我们要爬取的数据

# One table per novel, named after the novel: `section` holds the chapter
# title and `article` holds the chapter text.
sql = "CREATE TABLE `novel`.`{}`(     `section` TEXT(100) ,     `article` TEXT(10000)   );".format(name)
SQL.doSql(sql)

现在我们要开始爬取网页上我们需要的数据了，并且把爬取到的数据存入到我们所建立起的数据库里

# Visit every chapter link found on the book's index page.
for li in li_list:
    title = li.a.string
    detail_url = 'https://www.shicimingju.com' + li.a['href']

    # Download the chapter page and force UTF-8 decoding before reading .text.
    detail_page = requests.get(url=detail_url, headers=headers)
    detail_page.encoding = 'utf-8'
    detail_soup = BeautifulSoup(detail_page.text, 'lxml')

    # The chapter body is the <div class="chapter_content"> element.
    div_tag = detail_soup.find('div', class_='chapter_content')
    content = div_tag.text

    # NOTE(review): title and content are interpolated into the statement
    # unescaped, so a chapter containing a single quote will break this
    # INSERT - escape the values before running this on real data.
    sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( \'{}\',\'{}\');".format(name, title, content)
    SQL.doSql(sql)
    print(title, '爬取成功！！！！ ')

最后附上我们的完整代码

import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *

# Module-level connection to the local `novel` database; the functions below
# run their queries through the shared cursor `cs1`.
conn = connect(host='localhost', user='root', password='', db='novel', charset='utf8')
cs1 = conn.cursor()
# cur = connect.cursor()
# cur.execute()


# Export a novel from the database to text files
def book(name):
    """Export every chapter of the novel stored in table ``name`` to text files.

    Reads all ``(section, article)`` rows of the table through the
    module-level cursor ``cs1`` and writes each row to
    ``./text/<name>/<index>_<section>.txt`` (UTF-8).

    Args:
        name: Table name of the novel; also used as the output folder name.
    """
    novel_path = os.path.join('./text', name)
    # makedirs also creates ./text on the first run and tolerates a folder
    # left behind by an earlier, interrupted export (os.mkdir raised in both
    # cases).
    os.makedirs(novel_path, exist_ok=True)

    # Characters that are not allowed in Windows file names; a chapter title
    # containing one of them would otherwise make open() fail.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    cs1.execute('SELECT section,article FROM `{}`;'.format(name))
    for index, row in enumerate(cs1.fetchall()):
        # Both columns are nullable, so fall back to an empty string.
        section = (row[0] or '').translate(unsafe_chars)
        article = row[1] or ''
        title_path = './text/' + name + '/' + str(index) + '_' + section + '.txt'
        with open(title_path, 'w', encoding='utf-8') as fp:
            fp.write(article)


# List the novels that have already been exported under ./text
def path():
    """Return the names of the folders directly under ``./text``.

    Returns:
        A list of sub-directory names (files are ignored). The list is empty
        when ``./text`` does not exist or contains no folders.
    """
    import os

    dir_path = './text'
    # Only the first os.walk() entry (the top level) is needed. os.walk()
    # yields nothing for a missing directory, so supply an empty default
    # instead of leaving the result unbound.
    _, dirs, _ = next(os.walk(dir_path), (dir_path, [], []))
    return dirs
# Word cloud
def wc(word, name):
    """Show a word cloud built from the novel stored in table ``name``.

    For every row of the table the article is segmented with jieba and the
    row's section title is added once per segmented token; the concatenated
    titles are then rendered with WordCloud and displayed with matplotlib.

    Args:
        word: Keyword entered by the user.
            NOTE(review): the keyword is never used to filter anything; it is
            only passed along on retry. It looks like a per-token filter was
            intended - confirm with the author.
        name: Table name of the novel to read.
    """
    count = cs1.execute('SELECT section,article FROM `{}`;'.format(name))
    content_num = []
    for _ in range(count):
        result = cs1.fetchone()
        tokens = jieba.lcut(result[1])
        # One copy of the section title per token. A separate loop name is
        # not needed here, so the `word` argument is no longer overwritten.
        content_num.extend([result[0]] * len(tokens))

    cut_text = "".join(content_num)
    if not cut_text:
        # The original compared against `NULL`, which is not Python's None
        # and never detected an empty result. It also fell through after the
        # retry and handed the empty text to WordCloud.
        # NOTE(review): the prompt asks for a keyword, but the answer is used
        # as the table name - confirm which one is intended.
        name = input('请重新输入关键词：')
        return wc(word, name)

    wordcloud = WordCloud(
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white",
        width=1000,
        height=800,
    ).generate(cut_text)
    plt.imshow(wordcloud)
    plt.show()


# Open the novel's folder
def sel(name_r):
    """Open the folder ``.\\text\\<name_r>`` with the system file browser.

    Uses ``os.startfile``, which is only available on Windows.

    Args:
        name_r: Name of the novel whose folder should be opened.
    """
    path_r = '.\\text\\' + name_r
    os.startfile(path_r)


def _open_exported(book_name):
    """Open ``book_name``'s folder, exporting it from the database first if needed."""
    if book_name not in path():
        book(book_name)
    sel(book_name)


def _crawl(name):
    """Download every chapter of ``name`` from shicimingju.com into a new table."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'}
    url = 'https://www.shicimingju.com/book/%s.html'
    name_p = pinyin.pinyin(name)
    url = url % name_p

    # Fetch the book's index page and collect the chapter links.
    index_page = requests.get(url=url, headers=headers)
    index_page.encoding = 'utf-8'
    soup = BeautifulSoup(index_page.text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')

    sql = "CREATE TABLE `novel`.`{}`(     `section` TEXT(100) ,     `article` TEXT(10000)   );".format(name)
    SQL.doSql(sql)

    for li in li_list:
        title = li.a.string
        detail_url = 'https://www.shicimingju.com' + li.a['href']
        detail_page = requests.get(url=detail_url, headers=headers)
        detail_page.encoding = 'utf-8'
        detail_soup = BeautifulSoup(detail_page.text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        if div_tag is None:
            # No chapter body on this page; skip it instead of crashing the
            # whole crawl on `None.text`.
            print(title, '未找到正文，已跳过')
            continue
        content = div_tag.text
        # Scraped text may contain quotes or backslashes; escape it before it
        # is spliced into the statement so the INSERT stays well-formed.
        sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( \'{}\',\'{}\');".format(
            name, conn.escape_string(str(title)), conn.escape_string(content))
        SQL.doSql(sql)
        print(title, '爬取成功！！！！ ')


if __name__ == "__main__":
    name = input('请输入小说名字：')

    # Collect the names of the novels already stored in the database. Each
    # row of SHOW TABLES is a 1-tuple holding the table name, so read it
    # directly instead of slicing the tuple's string form.
    cs1.execute('SHOW TABLES FROM novel;')
    content_list = [row[0] for row in cs1.fetchall()]

    # Already stored: let the user pick a book to read. Otherwise crawl it.
    if name in content_list:
        for i in content_list:
            print(i)
        while True:
            name_r = input("选择您要读的书籍：")
            if name_r in content_list:
                break
            print("输入有误请重新输入")
        word = input("关键词：")
        wc(word, name_r)
        # Export and open the book the user actually selected (the original
        # exported and opened `name` here even when `name_r` differed).
        _open_exported(name_r)
    else:
        _crawl(name)
        print("$$$$$$爬取结束！！！$$$$$$")
        word = input("关键词：")
        wc(word, name)
        _open_exported(name)

cs1.close()
conn.close()

完整代码里还有一些其他的功能，有感兴趣的小伙伴可以自己研究研究！