
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw review data set from the CSV file in the working directory.
data = pd.read_csv('Womens_Clothing.csv')
# Seaborn's current colour palette; the plots below pick colours from it by index.
color = sns.color_palette()
# Inspect the structure of the data
Unnamed: 0 Clothing ID Age Title Review Text Rating Recommended IND Positive Feedback Count Division Name Department Name Class Name
0 0 767 33 NaN Absolutely wonderful - silky and sexy and comf... 4 1 0 Initmates Intimate Intimates
1 1 1080 34 NaN Love this dress! it's sooo pretty. i happene... 5 1 4 General Dresses Dresses
2 2 1077 60 Some major design flaws I had such high hopes for this dress and reall... 3 0 0 General Dresses Dresses
3 3 1049 50 My favorite buy! I love, love, love this jumpsuit. it's fun, fl... 5 1 0 General Petite Bottoms Pants
4 4 847 47 Flattering shirt This shirt is very flattering to all due to th... 5 1 6 General Tops Blouses
... ... ... ... ... ... ... ... ... ... ... ...
23481 23481 1104 34 Great dress for many occasions I was very happy to snag this dress at such a ... 5 1 0 General Petite Dresses Dresses
23482 23482 862 48 Wish it was made of cotton It reminds me of maternity clothes. soft, stre... 3 1 0 General Petite Tops Knits
23483 23483 1104 31 Cute, but see through This fit well, but the top was very see throug... 3 0 1 General Petite Dresses Dresses
23484 23484 1084 28 Very cute dress, perfect for summer parties an... I bought this dress for a wedding i have this ... 3 1 2 General Dresses Dresses
23485 23485 1104 52 Please make more like this one! This dress in a lovely platinum is feminine an... 5 1 22 General Petite Dresses Dresses

23486 rows × 11 columns




中文名称 英文名称

服装ID Clothing ID

年龄 Age

标题 Title

评论文本 Review Text

评分 Rating

推荐的IND Recommended IND

积极的反馈计数 Positive Feedback Count

高级部门名称 Division Name

部门名称 Department Name

类名称 Class Name

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
Unnamed: 0                 23486 non-null int64
Clothing ID                23486 non-null int64
Age                        23486 non-null int64
Title                      19676 non-null object
Review Text                22641 non-null object
Rating                     23486 non-null int64
Recommended IND            23486 non-null int64
Positive Feedback Count    23486 non-null int64
Division Name              23472 non-null object
Department Name            23472 non-null object
Class Name                 23472 non-null object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB
# Inspect missing values (optional): data.isnull() gives the per-cell mask
# Drop every row that has a missing value in any column
# (axis=0 / how='any' are the dropna defaults, spelled out for clarity).
df = data.dropna(axis=0, how='any')
Unnamed: 0 Clothing ID Age Title Review Text Rating Recommended IND Positive Feedback Count Division Name Department Name Class Name
2 2 1077 60 Some major design flaws I had such high hopes for this dress and reall... 3 0 0 General Dresses Dresses
3 3 1049 50 My favorite buy! I love, love, love this jumpsuit. it's fun, fl... 5 1 0 General Petite Bottoms Pants
4 4 847 47 Flattering shirt This shirt is very flattering to all due to th... 5 1 6 General Tops Blouses
5 5 1080 49 Not for the very petite I love tracy reese dresses, but this one is no... 2 0 4 General Dresses Dresses
6 6 858 39 Cagrcoal shimmer fun I aded this in my basket at hte last mintue to... 5 1 1 General Petite Tops Knits
... ... ... ... ... ... ... ... ... ... ... ...
23481 23481 1104 34 Great dress for many occasions I was very happy to snag this dress at such a ... 5 1 0 General Petite Dresses Dresses
23482 23482 862 48 Wish it was made of cotton It reminds me of maternity clothes. soft, stre... 3 1 0 General Petite Tops Knits
23483 23483 1104 31 Cute, but see through This fit well, but the top was very see throug... 3 0 1 General Petite Dresses Dresses
23484 23484 1084 28 Very cute dress, perfect for summer parties an... I bought this dress for a wedding i have this ... 3 1 2 General Dresses Dresses
23485 23485 1104 52 Please make more like this one! This dress in a lovely platinum is feminine an... 5 1 22 General Petite Dresses Dresses

19662 rows × 11 columns


# 1. Histogram of the ages of the reviewers
plt.hist(x=df['Age'], label='age', color=color[1])
plt.title('age of commentator')
print('\n figure 01')
 figure 01


由figure01 可得出:给出评论的人的年龄大多在25到45之间,青年、中年人较多

# 2. Box plot of reviewer age for each rating level
plt.figure(figsize=(10, 8))
sns.boxplot(data=df, x='Rating', y='Age')
plt.title('age of rating')
print('\n figure 02')
 figure 02


由figure02 可得出:各评分等级对应的评论者年龄分布大致相同

查看Division Name、Department Name和Class Name的唯一值

# Print the distinct values of the three categorical columns,
# each prefixed with its (Chinese + English) label.
for label, column in (
    ('高级部门Division Name', 'Division Name'),
    ('部门Department Name', 'Department Name'),
    ('类名称Class Name', 'Class Name'),
):
    print(label, df[column].unique())
高级部门Division Name ['General' 'General Petite' 'Initmates']
部门Department Name ['Dresses' 'Bottoms' 'Tops' 'Intimate' 'Jackets' 'Trend']
类名称Class Name ['Dresses' 'Pants' 'Blouses' 'Knits' 'Intimates' 'Outerwear' 'Lounge'
 'Sweaters' 'Skirts' 'Fine gauge' 'Sleep' 'Jackets' 'Swim' 'Trend' 'Jeans'
 'Shorts' 'Legwear' 'Layering' 'Casual bottoms' 'Chemises']

将数据按Recommended IND拆分:推荐(值为1)与不推荐(值为0)

# Split the reviews by the 'Recommended IND' flag:
# rows with value 1 go to `recommend`, rows with value 0 to `not_recommend`.
recommend = df.loc[df['Recommended IND'].eq(1)]
not_recommend = df.loc[df['Recommended IND'].eq(0)]
# recommend.head()
Unnamed: 0 Clothing ID Age Title Review Text Rating Recommended IND Positive Feedback Count Division Name Department Name Class Name
2 2 1077 60 Some major design flaws I had such high hopes for this dress and reall... 3 0 0 General Dresses Dresses
5 5 1080 49 Not for the very petite I love tracy reese dresses, but this one is no... 2 0 4 General Dresses Dresses
10 10 1077 53 Dress looks like it's made of cheap material Dress runs small esp where the zipper area run... 3 0 14 General Dresses Dresses
22 22 1077 31 Not what it looks like First of all, this is not pullover styling. th... 2 0 7 General Dresses Dresses
25 25 697 31 Falls flat Loved the material, but i didnt really look at... 3 0 0 Initmates Intimate Lounge
# 4. Overlaid bar chart of recommended vs. not-recommended reviews per department.
# Count each department explicitly instead of calling plt.hist on the strings:
# hist bins the category *positions* into equal-width bins, which only matches
# one bar per category by coincidence. Reindexing both count series on the same
# category list keeps the two overlays aligned even if a department is missing
# from one of the subsets.
dept_names = pd.concat(
    [recommend['Department Name'], not_recommend['Department Name']]
).unique()
dept_rec_counts = (
    recommend['Department Name'].value_counts().reindex(dept_names, fill_value=0)
)
dept_not_rec_counts = (
    not_recommend['Department Name'].value_counts().reindex(dept_names, fill_value=0)
)
plt.bar(dept_names, dept_rec_counts.values,
        color=color[2], alpha=0.5, label='recommend')
plt.bar(dept_names, dept_not_rec_counts.values,
        color=color[4], alpha=0.5, label='not_recommend')
plt.title('Department recommend and not_recommend')
# The labels above are only shown once a legend is drawn.
plt.legend()
print('\n figure 03')
 figure 03


由figure03可知:绿色(recommend)的面积大于紫色(not_recommend)的面积,由此说明,各部门的商品大多被推荐

# Overlaid bar chart of recommended vs. not-recommended reviews per clothing class.
# plt.hist on string data maps every category to an integer position and then
# splits that range into 10 equal-width bins by default; with more classes than
# bins, neighbouring classes are merged into one bar. Counting per class and
# drawing bars gives exactly one bar per class, aligned across both series.
class_names = pd.concat(
    [recommend['Class Name'], not_recommend['Class Name']]
).unique()
class_rec_counts = (
    recommend['Class Name'].value_counts().reindex(class_names, fill_value=0)
)
class_not_rec_counts = (
    not_recommend['Class Name'].value_counts().reindex(class_names, fill_value=0)
)
plt.bar(class_names, class_rec_counts.values,
        color=color[1], alpha=0.5, label='recommend')
plt.bar(class_names, class_not_rec_counts.values,
        color=color[5], alpha=0.5, label='not_recommend')
# Many class labels: rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.title('Class recommend and not_recommend')
# The labels above are only shown once a legend is drawn.
plt.legend()
print('\n figure 04')
 figure 04



# Which age groups write what kind of review for which kind of clothes.
# `df` is the result of data.dropna(); adding a column to it directly raises
# pandas' SettingWithCopyWarning, so work on an explicit copy first.
df = df.copy()
# Review length = number of characters in the review text.
df['Review Length'] = df['Review Text'].astype(str).apply(len)
E:\anaconda\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Unnamed: 0 Clothing ID Age Title Review Text Rating Recommended IND Positive Feedback Count Division Name Department Name Class Name Review Length
2 2 1077 60 Some major design flaws I had such high hopes for this dress and reall... 3 0 0 General Dresses Dresses 500
3 3 1049 50 My favorite buy! I love, love, love this jumpsuit. it's fun, fl... 5 1 0 General Petite Bottoms Pants 124
4 4 847 47 Flattering shirt This shirt is very flattering to all due to th... 5 1 6 General Tops Blouses 192
5 5 1080 49 Not for the very petite I love tracy reese dresses, but this one is no... 2 0 4 General Dresses Dresses 488
6 6 858 39 Cagrcoal shimmer fun I aded this in my basket at hte last mintue to... 5 1 1 General Petite Tops Knits 496
... ... ... ... ... ... ... ... ... ... ... ... ...
23481 23481 1104 34 Great dress for many occasions I was very happy to snag this dress at such a ... 5 1 0 General Petite Dresses Dresses 131
23482 23482 862 48 Wish it was made of cotton It reminds me of maternity clothes. soft, stre... 3 1 0 General Petite Tops Knits 223
23483 23483 1104 31 Cute, but see through This fit well, but the top was very see throug... 3 0 1 General Petite Dresses Dresses 208
23484 23484 1084 28 Very cute dress, perfect for summer parties an... I bought this dress for a wedding i have this ... 3 1 2 General Dresses Dresses 427
23485 23485 1104 52 Please make more like this one! This dress in a lovely platinum is feminine an... 5 1 22 General Petite Dresses Dresses 110

19662 rows × 12 columns

# Plot the distribution of the single variable 'Review Length'.
# sns.distplot() is the most convenient way to plot a univariate distribution;
# by default it draws a histogram and fits a kernel density estimate (KDE).
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot /
# displot replace it) - confirm the installed seaborn version before upgrading.
fig = plt.figure(figsize=(12, 8))
ax = sns.distplot(df['Review Length'], color=color[3])
# Set the title on the Axes itself; `ax = plt.title(...)` would rebind `ax`
# to the returned Text object and lose the Axes handle.
ax.set_title("Length of Reviews")
print('\n figure 05')
 figure 05


由figure05可得出:大部分评论的长度在500个字符左右

# Box plot of the review-length distribution for each reviewer age
sns.boxplot(data=df, x='Age', y='Review Length')
print('\n figure 06')
 figure 06

# Rating versus positive feedback count (one box per rating level)
sns.boxplot(data=df, x='Rating', y='Positive Feedback Count')
print('\n figure 07')
 figure 07


由figure07可得出:评分在3以上的评论获得的正面反馈计数较大


# 1. Data cleaning
import re
from wordcloud import WordCloud, STOPWORDS


def clean_data(text):
    """Return *text* lower-cased with every non-letter replaced by a space.

    Runs of whitespace are collapsed, so the result is the remaining
    ASCII-letter words joined by single spaces.
    """
    # Replace punctuation, digits and any other non-letter with a space.
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()
    return " ".join(words)


# wordcloud's built-in stop words plus garment names, which would otherwise
# dominate every cloud. ('pants' and 'jean' were previously fused into the
# single entry 'pantsjean' by a missing comma, so neither was filtered.)
stopwords = set(STOPWORDS) | {
    'skirt', 'blouse', 'dress', 'sweater', 'shirt', 'bottom',
    'pant', 'pants', 'jean', 'jeans', 'jacket', 'top', 'dresse',
}


def create_cloud(rating):
    """Draw a word cloud built from an iterable of review strings.

    All strings in *rating* are joined into one text; at most 100 words are
    drawn, with the module-level ``stopwords`` excluded. The figure is shown
    via ``plt.show()``; nothing is returned.
    """
    text = ' '.join(rating)
    cloud = WordCloud(
        background_color='white',
        width=1600,
        height=800,
        max_words=100,
        stopwords=stopwords,
    ).generate(text)
    plt.figure(figsize=(15, 7.5))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
# Cleaned review texts per rating level, as input for the word clouds.

# Rating 5
rating5 = df.loc[df['Rating'] == 5, 'Review Text'].apply(clean_data)

# Rating 4
rating4 = df.loc[df['Rating'] == 4, 'Review Text'].apply(clean_data)

# Rating 3
rating3 = df.loc[df['Rating'] == 3, 'Review Text'].apply(clean_data)

# Rating 2
rating2 = df.loc[df['Rating'] == 2, 'Review Text'].apply(clean_data)

# Rating 1
rating1 = df.loc[df['Rating'] == 1, 'Review Text'].apply(clean_data)


  1. 电商数据指标与《电商数据分析与数据化营销》

