基于Python 爬虫+简单数据分析附PPT

按照老师的要求，用 Python 做一个关于数据分析的小项目
——“面向百度编程”，哈哈

我选的是：爬取豆瓣电影 Top250 数据，然后分析豆瓣用户的观影喜好。

PPT部分截图：
PPT 不知道怎么上传 =.=，需要的请留言。

爬虫部分代码：

#-*- coding: utf-8 -*-
import io
import sys
from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import time
import re
from multiprocessing import pool, Pool


class DoubanMovieTop():
    """Scrape the Douban Top 250 movie list and cache the result as CSV.

    Typical use: ``get_info()`` to crawl, then ``dump_data()`` to build
    ``self.df`` and write it to disk.
    """

    # Compiled once instead of on every detail page.
    _REGION_RE = re.compile('地区: (.*?)\n语言', re.S)
    _LANGUAGE_RE = re.compile('语言: (.*?)\n上映日期')

    def __init__(self, request_delay=0):
        """Prepare the list-page URLs and empty result containers.

        Args:
            request_delay: Seconds to sleep after each movie detail page.
                Defaults to 0, i.e. no pause.
        """
        # The list is paginated 25 movies per page via the ``start`` offset.
        self.top_urls = ['https://movie.douban.com/top250?start={0}&filter='.format(x*25) for x in range(10)]
        # title -> flat list of field values, in the order given by ``columns``.
        self.data = defaultdict(list)
        self.columns = ['title', 'link', 'score', 'score_cnt', 'top_no', 'director', 'writers', 'actors', 'types',
                        'edit_location', 'language', 'dates', 'play_location', 'length', 'rating_per', 'betters',
                        'had_seen', 'want_see', 'tags', 'short_review', 'review', 'ask', 'discussion']
        # Filled by dump_data().
        self.df = None
        self.request_delay = request_delay

    def get_bsobj(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree (lxml parser)."""
        # The context manager closes the HTTP response even if decoding fails.
        with urlopen(url) as response:
            html = response.read().decode('utf-8')  # bytes -> str
        return BeautifulSoup(html, 'lxml')

    def get_info(self):
        """Crawl every list page and every linked detail page into ``self.data``."""
        for url in self.top_urls:
            bsobj = self.get_bsobj(url)
            main = bsobj.find('ol', {'class': 'grid_view'})

            # Title and detail-page link.
            title_objs = main.find_all('div', {'class': 'hd'})
            titles = [i.find('span').text for i in title_objs]
            links = [i.find('a')['href'] for i in title_objs]

            # Score and number of ratings (the last <span> inside the star block).
            score_objs = main.find_all('div', {'class': 'star'})
            scores = [i.find('span', {'class': 'rating_num'}).text for i in score_objs]
            score_cnts = [i.find_all('span')[-1].text for i in score_objs]

            for title, link, score, score_cnt in zip(titles, links, scores, score_cnts):
                # NOTE(review): rows are keyed by title, so two movies sharing a
                # title would be merged into one over-long row.
                self.data[title].extend([title, link, score, score_cnt])
                bsobj_more = self.get_bsobj(link)
                more_data = self.get_more_info(bsobj_more)
                self.data[title].extend(more_data)
                print(self.data[title])
                print(len(self.data))
                time.sleep(self.request_delay)

    def get_more_info(self, bsobj):
        """Extract the detail-page fields of one movie.

        Args:
            bsobj: Parsed detail page, as returned by ``get_bsobj``.

        Returns:
            A list of 19 values matching ``self.columns[4:]``
            (``top_no`` through ``discussion``).

        Raises:
            AttributeError, IndexError, TypeError: if the page lacks one of
                the expected elements; only a missing writers block is tolerated.
        """
        # Rank on the list: the text after the first '.' in the badge.
        top_no = bsobj.find('span', {'class': 'top250-no'}).text.split('.')[1]
        # Container holding most of the remaining fields.
        main = bsobj.find('div', {'id': 'info'})
        # Directors.
        director = [i.text for i in main.find_all('a', {'rel': 'v:directedBy'})]
        # Writers: taken from the second "attrs" span; some pages have fewer.
        try:
            writer_obj = main.find_all('span', {'class': 'attrs'})[1]
            writers = [i.text for i in writer_obj.find_all('a')]
        except IndexError as e:
            writers = []
            print(e)
        # Leading actors (an empty list when the page lists none).
        actors = [i.text for i in main.find_all('a', {'rel': 'v:starring'})]
        # Genres.
        types = [i.text for i in main.find_all('span', {'property': 'v:genre'})]
        # Production region and language are plain text between labels.
        edit_location = re.findall(self._REGION_RE, main.text)[0]
        language = re.findall(self._LANGUAGE_RE, main.text)[0]
        # Release entries: first 4 characters before '(' are kept as the year,
        # the parenthesised part (minus the closing ')') as the location.
        date_obj = main.find_all('span', {'property': 'v:initialReleaseDate'})
        dates = [i.text.split('(')[0][:4] for i in date_obj]
        play_location = [i.text.split('(')[1][:-1] for i in date_obj]
        # Runtime, read from the element's ``content`` attribute.
        length = main.find('span', {'property': 'v:runtime'})['content']
        # Share of each star rating (the source comment says 5-star down to 1-star).
        rating_per = [i.text for i in bsobj.find_all('span', {'class': 'rating_per'})]
        # "Better than N% of <genre> films" links.
        better_obj = bsobj.find('div', {'class': 'rating_betterthan'})
        betters = [i.text for i in better_obj.find_all('a')]
        # Watched / want-to-watch counts: first and last link, minus a 3-character suffix.
        watch_obj = bsobj.find('div', {'class': 'subject-others-interests-ft'})
        had_seen = watch_obj.find('a').text[:-3]
        want_see = watch_obj.find_all('a')[-1].text[:-3]
        # Tags.
        tags = [i.text for i in bsobj.find('div', {'class': 'tags-body'}).find_all('a')]
        # Short-comment count: second space-separated token of the link text.
        short_obj = bsobj.find('div', {'id': 'comments-section'})
        short_review = short_obj.find('div').find('span', {'class': 'pl'}).find('a').text.split(' ')[1]
        # Review count: second space-separated token of the "reviews" link text.
        review = bsobj.find('a', {'href': 'reviews'}).text.split(' ')[1]
        # Question count: link text without its first 2 and last 1 characters.
        ask_obj = bsobj.find('div', {'id': 'askmatrix'})
        ask = ask_obj.find('h2').find('a').text.strip()[2:-1]
        # Discussion count: text after the full-width '（', trimmed by 2 characters on each side.
        discuss_obj = bsobj.find('p', {'class': 'pl', 'align': 'right'}).find('a')
        discussion = discuss_obj.text.strip().split('（')[1][2:-2]

        more_data = [top_no, director, writers, actors, types, edit_location, language, dates, play_location,
                     length, rating_per, betters, had_seen, want_see, tags, short_review, review, ask, discussion]
        return more_data

    def dump_data(self, path='douban_top250.csv'):
        """Build ``self.df`` from the crawled rows and write it to ``path`` as CSV."""
        rows = list(self.data.values())
        self.df = pd.DataFrame(rows, columns=self.columns)
        self.df.to_csv(path, index=False)


if __name__ == '__main__':
    # Re-wrap stdout so the Chinese text printed during the crawl can be encoded
    # as gb18030. Done here rather than in the class body so that merely
    # importing this module does not replace sys.stdout.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
    douban = DoubanMovieTop()
    douban.get_info()
    douban.dump_data()

数据处理分析部分代码：

#-*- coding: utf-8 -*-
import io
import json
import sys
from collections import Counter
from functools import reduce
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy
import ast
import pandas as pd
from math import log2
from math import sqrt

# Load the CSV of scraped Top 250 records.
df = pd.read_csv('douban_top250.csv')
print(df.loc[5, 'director'])

# The list-valued columns are stored in the CSV as the text of a Python list;
# parse them back into real lists. ast.literal_eval accepts literals only, so a
# tampered CSV cannot execute code the way eval() would.
df_tmp = df[['director', 'writers', 'actors', 'types', 'dates', 'play_location', 'rating_per', 'betters', 'tags']]
df[['director', 'writers', 'actors', 'types', 'dates', 'play_location', 'rating_per', 'betters', 'tags']] = \
    df_tmp.apply(lambda column: column.map(ast.literal_eval))
df['dates'] = df['dates'].map(lambda x: [int(i) for i in x])
# Earliest release year of each film.
df['year'] = df['dates'].map(min)

# Strip surrounding whitespace from director names.
df['director'] = df['director'].map(lambda x: [i.strip() for i in x])

# Rating count: drop the 3-character suffix (e.g. "x人评价" -> "x") and convert to int.
df['score_cnt'] = df['score_cnt'].map(lambda x: int(x[:-3]))
# Bare expression: only displays something in an interactive session.
df.describe()['score_cnt']

# Share of 5-star ratings, and share of favourable ratings (first two entries of rating_per).
df['five_star_rate'] = df['rating_per'].map(lambda x: float(x[0][:-1])/100)
df['favor_rate'] = df['rating_per'].map(lambda x: (float(x[0][:-1]) + float(x[1][:-1]))/100)

# Average "better than N% of films in the same genre" percentage.
df['better_than'] = df['betters'].map(lambda x: sum(int(i.split('%')[0]) for i in x)/len(x))

# (Release years were already converted from str to int above.)
#
# About lambda:
#   an anonymous function that keeps the parameter and the expression together.
#
#       Example: g = lambda x: x + 1
#                g(1) == 2
#       x is the parameter and x + 1 is the body, equivalent to
#           def g(x):
#               return x + 1

# ------------------------ Directors by number of listed films (unweighted) -----------------------
# Flatten the per-film director lists into one list of names
# (linear time, and an empty frame yields [] instead of raising).
director_list = [name for directors in df.director for name in directors]
print(len(director_list))

# Which directors have more than 3 films on the list?
# Counter tallies occurrences; items() yields (name, count) pairs, which are
# then sorted by count in descending order.
print(director_list)
dire_counter = Counter(director_list)    # e.g. '宫崎骏': 7
dire_counter = sorted(dire_counter.items(), key=lambda x: x[1], reverse=True)
top_directors = [item for item in dire_counter if item[1] >= 4]
print(top_directors)

# For each top director collect the scores and the list ranks of their films.
top_dire_score = defaultdict(list)
top_dire_ind = defaultdict(list)
for name, _ in top_directors:
    for index, row in df.iterrows():
        if name in row['director']:
            top_dire_score[name].append(row['score'])
            top_dire_ind[name].append(row['top_no'])
print(top_dire_score)
print(top_dire_ind)

# Weighted rankings: mean score scaled up, and mean rank scaled down, by
# sqrt(log2(number of films)).
rank_score = []
rank_ind = []
for name, scores in top_dire_score.items():
    rank_score.append([name, sum(scores) / len(scores) * sqrt(log2(len(scores)))])
for name, indexes in top_dire_ind.items():
    # Weight by this director's own film count; the previous code reused
    # ``scores`` left over from the loop above (the last director's list).
    rank_ind.append([name, sum(indexes) / sqrt(log2(len(indexes))) / len(indexes)])
rank_score = sorted(rank_score, key=lambda x: x[1], reverse=True)
rank_ind = sorted(rank_ind, key=lambda x: x[1])

# Next: collect the actors of every film into one flat list.
# Flatten the per-film actor lists into one list of names
# (linear time, and an empty frame yields [] instead of raising).
actor_list = [name for actors in df.actors for name in actors]
print(len(actor_list))
actor_counter = Counter(actor_list)    # occurrences per actor, e.g. ('张国荣', 8)
actor_counter = sorted(actor_counter.items(), key=lambda x: x[1], reverse=True)
# Keep actors appearing in at least 5 listed films.
top_actors = [item for item in actor_counter if item[1] >= 5]

plt.rcParams['font.sans-serif'] = ['SimHei']    # font able to render the Chinese labels
names = [i[0] for i in top_actors]
cnt = [i[1] for i in top_actors]
fig, ax = plt.subplots(figsize=(16, 8))
plt.barh(names, cnt, color='c')
ax.set_yticklabels(labels=names, fontdict={'x': 0.5, 'fontsize': 24})
# Author's note, presumably listing alignment options: 'top', 'bottom', 'center', 'baseline'
plt.xlabel('上榜电影数', fontsize=20)
plt.title('豆瓣电影TOP250入榜最多演员情况', fontsize=35)
plt.show()

# ----------------------------------- Weighted score ranking ------------------
from math import log2
from math import sqrt

# For each top actor collect the scores and the list ranks of their films.
top_actor_score = defaultdict(list)
top_actor_ind = defaultdict(list)
for name, _ in top_actors:
    for index, row in df.iterrows():
        if name in row['actors']:
            top_actor_score[name].append(row['score'])    # actor -> scores
            top_actor_ind[name].append(row['top_no'])     # actor -> ranks
print(top_actor_score)
print(top_actor_ind)

# Weighted rankings: mean score scaled up, and mean rank scaled down, by
# sqrt(log2(number of films)).
rank_score_a = []
rank_ind_a = []
for name, scores in top_actor_score.items():
    rank_score_a.append([name, sum(scores) / len(scores) * sqrt(log2(len(scores)))])
for name, indexes in top_actor_ind.items():
    # Weight by this actor's own film count; the previous code reused
    # ``scores`` left over from the loop above (the last actor's list).
    rank_ind_a.append([name, sum(indexes) / sqrt(log2(len(indexes))) / len(indexes)])
rank_score_a = sorted(rank_score_a, key=lambda x: x[1], reverse=True)
rank_ind_a = sorted(rank_ind_a, key=lambda x: x[1])
print(rank_score_a[:5])
print(rank_ind_a[:5])

names = [i[0] for i in rank_score_a]
score = [i[1] for i in rank_score_a]
fig, ax = plt.subplots(figsize=(16, 8))
plt.barh(names, score, color='c')
ax.set_yticklabels(labels=names, fontdict={'y': 0.8, 'fontsize': 24})
plt.xlabel('加权得分', fontsize=20)
plt.title('豆瓣电影TOP250演员加权得分榜', fontsize=24)
plt.show()
# -------------- Store each country and its Top 250 film count as a JSON file ---------------------
# Convert a Python dict to JSON and write it to a file.
# NOTE(review): python_dict is still empty at this point, so the file receives
# "{}"; the country counts are only computed further down.
python_dict = {}
# The with-block guarantees the file is flushed and closed; UTF-8 matches
# ensure_ascii=False, which writes non-ASCII characters verbatim.
with open('json_obj.json', 'w', encoding='utf-8') as fp:
    json_obj1 = json.dump(python_dict, fp, ensure_ascii=False)    # json.dump returns None

# -------------------- Countries / regions on the Douban Top 250 -------------------------
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import re
country_list1 = []
for location in df['edit_location']:
    print(re.split('/', location))
    # A film may list several regions separated by '/'; drop the spaces,
    # split, and add every region to the flat list.
    country_list1.extend(location.replace(' ', '').split('/'))

country_counter = Counter(country_list1)    # occurrences per country/region
country_counter = sorted(country_counter.items(), key=lambda x: x[1], reverse=True)
# Keep countries/regions that appear at least 4 times.
top_countrys = [item for item in country_counter if item[1] >= 4]

# Bar chart of appearances per country/region.
plt.rcParams['font.sans-serif'] = ['SimHei']    # font able to render the Chinese labels
names = [i[0] for i in top_countrys]
cnt = [i[1] for i in top_countrys]
fig, ax = plt.subplots(figsize=(16, 8))
plt.barh(names, cnt, color='c')
ax.set_yticklabels(labels=names, fontdict={'x': 0, 'fontsize': 20})
# Author's note, presumably listing alignment options: 'top', 'bottom', 'center', 'baseline'
plt.xlabel('上榜次数', fontsize=24)
plt.title('豆瓣电影TOP250入榜国家或地区', fontsize=35)
plt.show()

# ------------------------- Build the word cloud -------------------
from functools import reduce
from wordcloud import WordCloud
# Build the input text: all tags of all films, separated by spaces.
tags = [tag for film_tags in df.tags for tag in film_tags]
result = ' '.join(tags)

# Word-cloud settings; a font with Chinese glyphs must be given explicitly.
wc = WordCloud(
    font_path='simfang.ttf',      # font file
    background_color='white',     # background colour
    width=1000,
    height=500,
    max_font_size=100,            # largest font size
    min_font_size=10,
    max_words=100,
)
wc.generate(result)
# NOTE(review): bg is loaded but never used below; it looks like it was meant
# to be passed to WordCloud as a mask -- confirm before wiring it in.
bg = np.array(Image.open("doubanbg.png"))

# Show the word cloud.
plt.figure('豆瓣Top250电影标签', figsize=(16, 8))
plt.imshow(wc)
plt.axis('off')
plt.show()

# ------------------- Film genres Douban users like (pie chart) -------------------------------------
from collections import Counter
from functools import reduce
import matplotlib.pyplot as plt

# Strip surrounding whitespace from genre names.
df['types'] = df['types'].map(lambda x: [i.strip() for i in x])
# Flatten the per-film genre lists into one list.
types_list = [genre for genres in df.types for genre in genres]
type_counter = Counter(types_list)
# Ascending by count, so the most frequent genre ends up last.
type_counter = sorted(type_counter.items(), key=lambda x: x[1], reverse=False)
# Keep genres that appear at least 30 times.
top_type_counter = [item for item in type_counter if item[1] >= 30]

names = [i[0] for i in top_type_counter]
cnt = [i[1] for i in top_type_counter]
type_len = len(types_list)
print(names[:-1])
plt.rcParams['font.sans-serif'] = ['SimHei']    # font able to render the Chinese labels
plt.title('豆瓣用户喜欢的电影类型', fontsize=20)
# Drop the last (most frequent) entry; the original note says this removes 剧情 (drama).
plt.pie(cnt[:-1], labels=names[:-1], shadow=True, autopct='%.2f%%')
plt.show()

# ------------------- Top 250 distribution by release year (bar chart) -----------------------------------
from functools import reduce
import ast
import pandas as pd
import matplotlib.pyplot as plt

# Author's note: bar charts are mainly for categorical data, histograms for numeric data.
# Reload the CSV so this section starts from the raw columns again.
df = pd.read_csv('douban_top250.csv')
# Parse the list-valued columns back into lists. ast.literal_eval accepts
# literals only, so a tampered CSV cannot execute code the way eval() would.
df_tmp = df[['director', 'writers', 'actors', 'types', 'dates', 'play_location', 'rating_per', 'betters', 'tags']]
df[['director', 'writers', 'actors', 'types', 'dates', 'play_location', 'rating_per', 'betters', 'tags']] = \
    df_tmp.apply(lambda column: column.map(ast.literal_eval))
# Smallest year string of each film, e.g. '1994'.
df['dates'] = df['dates'].map(min)
# Rating count: drop the 3-character suffix and convert to int.
df['score_cnt'] = df['score_cnt'].map(lambda x: int(x[:-3]))

# Release years as a sorted list of ints.
years_list = sorted(int(year) for year in df['dates'])
print(years_list)

# Decade boundaries used as histogram bin edges.
group = [1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]
plt.rcParams['font.sans-serif'] = ['SimHei']    # font able to render the Chinese labels
plt.hist(years_list, group, histtype='bar', rwidth=0.9)
plt.ylabel('上榜电影数')
plt.xlabel('电影上映年份')
plt.title('豆瓣用户喜欢的电影TOP250年份分布', fontsize=20)
plt.legend(['上榜电影数'])
plt.show()

# ----------------------- Correlation between rating count and watched count --------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

fig, ax = plt.subplots(figsize=(14, 8))
lr = LinearRegression(fit_intercept=True)
# Feature: rating count, reshaped to a single-column 2-D array for fit();
# target: watched count.
X = df['score_cnt'].values.reshape(-1, 1)
y = df['had_seen'].values
# Split into training and test sets (no random_state is set, so the split
# differs from run to run).
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Fit the model.
lr.fit(X_train, y_train)
# Predict on the held-out set.
y_pred = lr.predict(X_test)

# The x axis carries score_cnt and the y axis had_seen; the two labels were
# previously swapped.
plt.xlabel('评论人数')
plt.ylabel('已看人数')
plt.title('评论人数和已看人数之间的关系', fontsize=20)
# Scatter of the test points plus the fitted line.
plt.scatter(X_test, y_test, color='green', marker='+')
plt.plot(X_test, y_pred, '-', color='red')
plt.show()

PPT和源码：

基于Python 爬虫+简单数据分析附PPT相关推荐

基于 Python 爬虫+简单数据分析的课程设计(附PPT)
按照课程设计要求,要用python做一个关于数据分析的小项目,在这里我们选的是爬取豆瓣图书TOP250,其中用到一些常用的数据处理及可视化方法,比如Echarts.Flask等! PPT部分截图: 报 ...
基于python爬虫与数据分析系统设计
百度网盘下载地址(957):点击下载本文使用Python编写爬虫,通过向端口传送请求并且抓取传输过来的json字符串来获取招聘职位信息,并且分类保存为csv格式的表格文件.最后通过长时间的爬取,最终 ...
python识别ppt文件格式 ——（专栏：基于python编写简单office阅卷程序③）
● 研二在读学生,非工科非计算机专业,故代码简陋初级勿喷,本文仅为记录和快乐分享. ○ 感谢肯定,感谢点赞收藏分享,转载请注明本页出处即可. ____Ⓙ即刻@王昭没有君本文仅为笔者摸索总结-欢迎订正 ...
【大数据分析毕设之基于python爬虫的旅游大数据分析可视化系统】
[大数据分析毕设之基于python爬虫的旅游大数据分析可视化系统-哔哩哔哩] https://b23.tv/z2OUTkp flask web框架,数据使用selenium模块爬取携程网获取数据,使用 ...
【大数据分析专业毕设之基于python爬虫的电影票房大数据预测分析+大屏可视化分析
[大数据分析专业毕设之基于python爬虫的电影票房大数据预测分析+大屏可视化分析-哔哩哔哩https://b23.tv/saIKtBH flask web框架,数据使用requests模块爬取数据, ...
Python爬虫与数据分析
Python爬虫与数据分析目的爬取网易云音乐歌曲热评,分析热评特征. 思路 (1)爬取华语歌单中所有歌单url (2)从每篇歌单地址中爬取每首歌的url (3)从每首歌的首页爬取热评代码 (1) ...
基于python爬虫数据处理_基于Python爬虫的校园数据获取
苏艺航徐海蛟何佳蕾杨振宇王佳鹏摘要:随着移动时代的到来,只适配了电脑网页.性能羸弱的校园教务系统,已经不能满足学生们的移动查询需求.为此,设计了一种基于网络爬虫的高实用性查询系統.它首先通过 ...
python毕业设计开题报告-基于python爬虫的影评情感分析研究开题报告
论文(设计)题目基于python爬虫的影评情感分析研究开题报告选题的背景.意义及研究现状: 研究现状: 文本情感分析又称倾向性分析.情感挖掘,主观分析或评论挖掘,是对带有情感色彩的评论文本内容进行 ...
python画球鞋_基于Python爬虫原理的篮球鞋选择程序的设计与实现
基于 Python 爬虫原理的篮球鞋选择程序的设计与实现张世元 [期刊名称] <通讯世界> [年 ( 卷 ), 期] 2019(026)002 [摘要] 伴随着篮球鞋工艺的进步及产业升级 ...

基于Python 爬虫+简单数据分析附PPT

基于Python 爬虫+简单数据分析附PPT相关推荐

最新文章

热门文章

基于Python 爬虫+简单数据分析 附PPT

基于Python 爬虫+简单数据分析 附PPT相关推荐

最新文章

热门文章

基于Python 爬虫+简单数据分析附PPT

基于Python 爬虫+简单数据分析附PPT相关推荐