吴恩达机器学习课程：编程练习 | (8) ex8-anomaly detection and recommendation

1. 异常检测

"""
案例1: 检测异常服务器
数据集：data/ex8data1.mat
注：算法手动实现
"""import numpy as np
import scipy.io as sio
import matplotlib.pyplot as pltdef estimate_gaussian(X, isCovariance):  # 计算均值与协方差矩阵means = np.mean(X, axis=0)if isCovariance:sigma2 = (X - means).T @ (X - means) / len(X)  # 主对角线上的数值是方差，其他是协方差，也可用sigma=np.cov(X.T)else:sigma2 = np.var(X, axis=0)  # np.var计算方差return means, sigma2def gaussian_distribution(X, means, sigma2):if np.ndim(sigma2) == 1:  # 数组维度是2，为原高斯分布模型sigma2 = np.diag(sigma2)X = X - meansn = X.shape[1]first = np.power(2 * np.pi, -n / 2) * (np.linalg.det(sigma2) ** (-0.5))second = np.diag(X @ np.linalg.inv(sigma2) @ X.T)p = first * np.exp(-0.5 * second)p = p.reshape(-1, 1) # 转为(n,1)return pdef plotGaussian(X, means, sigma2):plt.figure()x = np.arange(0, 30, 0.5)y = np.arange(0, 30, 0.5)xx, yy = np.meshgrid(x, y)z = gaussian_distribution(np.c_[xx.ravel(), yy.ravel()], means, sigma2)  # 计算对应的高斯分布函数zz = z.reshape(xx.shape)plt.plot(X[:, 0], X[:, 1], 'bx')contour_levels = [10 ** h for h in range(-20, 0, 3)]plt.contour(xx, yy, zz, contour_levels)def select_threshold(yval, p):bestEpsilon = 0bestF1 = 0epsilons = np.linspace(min(p), max(p), 1000)for e in epsilons:p_ = p < etp = np.sum((yval == 1) & (p_ == 1))fp = np.sum((yval == 0) & (p_ == 1))fn = np.sum((yval == 1) & (p_ == 0))prec = tp / (tp + fp) if (tp + fp) else 0  # precision 精确度rec = tp / (tp + fn) if (tp + fn) else 0   # recall召回率F1_e = 2 * prec * rec / (prec + rec) if (prec + rec) else 0 # F1scoreif F1_e > bestF1:bestF1 = F1_ebestEpsilon = ereturn bestEpsilon, bestF1if __name__ == '__main__':mat = sio.loadmat('data/ex8data1.mat')  # dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])X = mat.get('X')  # X-->(307,2)X_val, y_val= mat['Xval'], mat['yval']# 原高斯分布模型means, sigma2 = estimate_gaussian(X, isCovariance=False)plotGaussian(X, means, sigma2)# 多元高斯分布模型means, sigma2 = estimate_gaussian(X, isCovariance=True)plotGaussian(X, means, sigma2)# 通过交叉验证集选取阈值εp_val = gaussian_distribution(X_val, means, sigma2)bestEpsilon, bestF1 = select_threshold(y_val, p_val)print(bestEpsilon, bestF1)p = gaussian_distribution(X, means, sigma2)anoms = np.array([X[i] for i in range(X.shape[0]) if p[i] < bestEpsilon])print(len(anoms))print('sss',anoms)plotGaussian(X, means, sigma2)plt.scatter(anoms[:, 0], anoms[:, 1], c='r', marker='o')# plt.plot(X[:, 0], X[:, 1], 'bx')plt.show()# -------------案例2: 高维数据的异常检测  数据集：data/ex8data1.mat-------------------mat_h = sio.loadmat('data/ex8data2.mat') # dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])X2 = mat_h['X']X_val2, y_val2 = mat_h['Xval'], mat_h['yval']print(X2.shape,X_val2.shape,y_val2.shape)means_h,sigma2_h = estimate_gaussian(X2, isCovariance=True)pval = gaussian_distribution(X_val2, means_h, sigma2_h)bestEpsilon_h, bestF1_h = select_threshold(y_val2, pval)p_h = gaussian_distribution(X2,means_h,sigma2_h)anoms_h = np.array([X2[i] for i in range(X2.shape[0]) if p_h[i] < bestEpsilon_h])print(len(anoms_h))

2. 01-异常检测01

"""
案例1: 检测异常服务器
数据集：data/ex8data1.mat
注：多元高斯分布模型算法借助其他库(scipy)来实现
"""import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as snssns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_reportdef select_threshold(X, X_val, y_val):"""use CV data to find the best epsilonReturns:e: best epsilon with the highest f-scoref-score: such best f-score"""# create multivariate model using training datamu = X.mean(axis=0)cov = np.cov(X.T)multi_normal = stats.multivariate_normal(mu, cov)# this is key, use CV data for fine tuning hyper parameterspval = multi_normal.pdf(X_val)# set up epsilon candidatesepsilon = np.linspace(np.min(pval), np.max(pval), num=10000)# calculate f-scorefs = []for e in epsilon:y_pred = (pval <= e).astype('int')fs.append(f1_score(y_val, y_pred))# find the best f-scoreargmax_fs = np.argmax(fs)return epsilon[argmax_fs], fs[argmax_fs]def predict(X, X_val, e, X_test, y_test):"""with optimal epsilon, combine X, Xval and predict XtestReturns:multi_normal: multivariate normal modely_pred: prediction of test data"""Xdata = np.concatenate((X, X_val), axis=0)mu = Xdata.mean(axis=0)cov = np.cov(Xdata.T)multi_normal = stats.multivariate_normal(mu, cov)# calculate probability of test datapval = multi_normal.pdf(X_test)y_pred = (pval <= e).astype('int')print(classification_report(y_test, y_pred))return multi_normal, y_predif __name__ == '__main__':mat = sio.loadmat('data/ex8data1.mat')  # dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])X = mat.get('X')  # X-->(307,2)# 该方法与前一种方法，在数据有区别，前一种没有划分测试集数据，该方法划分出测试集数据，X_val, X_test, y_val, y_test = train_test_split(mat.get('Xval'), mat.get('yval').ravel(), test_size=0.5)e, fs = select_threshold(X, X_val, y_val)print('Best epsilon: {}\nBest F-score on validation data: {}'.format(e, fs))multi_normal, y_pred = predict(X, X_val, e, X_test, y_test)# create a gridx, y = np.mgrid[0:30:0.5, 0:30:0.5]  # x->(3000,3000),y->(3000,3000)pos = np.dstack((x, y))print(pos.shape)fig, ax = plt.subplots()# plot probability densityax.contourf(x, y, multi_normal.pdf(pos), cmap='Blues')# plot original data pointssns.regplot('Latency', 'Throughput',data=pd.DataFrame(X, columns=['Latency', 'Throughput']),fit_reg=False,ax=ax,scatter_kws={"s": 10,"alpha": 0.4})# construct test DataFramedata = pd.DataFrame(X_test, columns=['Latency', 'Throughput'])data['y_pred'] = y_pred# mark the predicted anamoly of CV data. We should have a test set for this...anamoly_data = data[data['y_pred'] == 1]ax.scatter(anamoly_data['Latency'], anamoly_data['Throughput'], marker='x', s=50, c='r')print(anamoly_data.shape)plt.show()# -----------------------------------high dimension data-------------------------------mat_h = sio.loadmat('./data/ex8data2.mat')X_h = mat_h.get('X')Xval, Xtest, yval, ytest = train_test_split(mat_h.get('Xval'), mat_h.get('yval').ravel(), test_size=0.5)e_h, fs_h = select_threshold(X_h, Xval, yval)print('Best epsilon: {}\nBest F-score on validation data: {}'.format(e_h, fs_h))multi_normal_h, y_pred_h = predict(X_h, Xval, e_h, Xtest, ytest)print('find {} anamolies'.format(y_pred_h.sum())) # 由于用了随机的分割数据，每次运行，结果可能都不相同

3. 推荐系统

"""
案例：给用户推荐电影
"""import matplotlib.pyplot as plt
import seaborn as snssns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.optimize as optdef serialize(X, theta):"""serialize 2 matrix"""# X (movie, feature), (1682, 10): movie features# theta (user, feature), (943, 10): user preferencereturn np.append(X.flatten(), theta.flatten())def deserialize(param, n_movie, n_user, n_features):"""into ndarray of X(1682, 10), theta(943, 10)"""return param[:n_movie * n_features].reshape(n_movie, n_features), \param[n_movie * n_features:].reshape(n_user, n_features)# recommendation fn
def cost_function(param, Y, R, n_features):"""compute cost for every r(i, j)=1Args:param: serialized X, thetaY (movie, user), (1682, 943): (movie, user) ratingR (movie, user), (1682, 943): (movie, user) has rating"""# theta (user, feature), (943, 10): user preference# X (movie, feature), (1682, 10): movie featuresn_movie, n_user = Y.shapeX, theta = deserialize(param, n_movie, n_user, n_features)inner = np.multiply(X @ theta.T - Y, R)return np.power(inner, 2).sum() / 2def gradient(param, Y, R, n_features):# theta (user, feature), (943, 10): user preference# X (movie, feature), (1682, 10): movie featuresn_movies, n_user = Y.shapeX, theta = deserialize(param, n_movies, n_user, n_features)inner = np.multiply(X @ theta.T - Y, R)  # (1682, 943)# X_grad (1682, 10)X_grad = inner @ theta# theta_grad (943, 10)theta_grad = inner.T @ X# roll them together and returnreturn serialize(X_grad, theta_grad)def regularized_cost(param, Y, R, n_features, λ=1):reg_term = np.power(param, 2).sum() * (λ / 2)return cost_function(param, Y, R, n_features) + reg_termdef regularized_gradient(param, Y, R, n_features, λ=1):grad = gradient(param, Y, R, n_features)reg_term = λ * paramreturn grad + reg_termdef normalizeRatings(Y, R):  # 均值归一化，见算法见笔记16.6Y_mean = (Y.sum(axis=1) / R.sum(axis=1)).reshape(-1, 1)Y_norm = (Y - Y_mean) * Rreturn Y_norm, Y_meanif __name__ == '__main__':movies_mat = sio.loadmat('./data/ex8_movies.mat')  # dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])# Y->(1682,943)(电影数×用户数)用户电影评分矩阵(数组)# R->(1682,943) 电影评分的二进制值的"指示符"数据，例如[5 4 0 0 3 0]对应[1 1 0 1 0]Y, R = movies_mat.get('Y'), movies_mat.get('R')Y_plot = Y# param_mat-->dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies',# 'num_features'])param_mat = sio.loadmat('./data/ex8_movieParams.mat')theta, X = param_mat.get('Theta'), param_mat.get('X')  # theta->(943,10) 10个特征数量 X->(1682,10) 电影特征(偏好程度)矩阵# -------------选取一小段数据，计算代价函数-------------users = 4movies = 5features = 3X_sub = X[:movies, :features]theta_sub = theta[:users, :features]Y_sub = Y[:movies, :users]R_sub = R[:movies, :users]param_sub = serialize(X_sub, theta_sub)cost_sub = cost_function(param_sub, Y_sub, R_sub, features)  # λ=0时的regularized_cost即cost_function函数print("小段数据代价函初始值（λ=0）：{}".format(cost_sub))cost_sub1 = regularized_cost(param_sub, Y_sub, R_sub, features, λ=1.5)print("小段数据代价函初始值（λ=1.5）：{}".format(cost_sub1))# -------------整个数据，计算的代价函数-----------------param = serialize(X, theta)  # total real paramsprint(param.shape)cost_real = cost_function(serialize(X, theta), Y, R, 10)  # this is real total costprint("所有数据代价函初始值（λ=0）：{}".format(cost_real))cost_real_1 = regularized_cost(serialize(X, theta), Y, R, 10, λ=1)print("所有数据代价函初始值（λ=1）：{}".format(cost_real_1))# 添加一个新的用户，根据该用户偏好自动推荐电影nm = int(param_mat['num_movies'])  # 电影数量nm=1682new_ratings = np.zeros((nm, 1))new_ratings[0] = 4new_ratings[6] = 3new_ratings[11] = 5new_ratings[53] = 4new_ratings[63] = 5new_ratings[65] = 3new_ratings[68] = 5new_ratings[97] = 2new_ratings[182] = 4new_ratings[225] = 5new_ratings[354] = 5Y = np.c_[Y, new_ratings]  # now I become user 0,在Y最后插入一列，insert貌似只能插入相同数值的列。Y->(1682,944)R = np.c_[R, new_ratings != 0]  # R->(1682,944)n_movie, n_user = Y.shape  # 电影总数量1682，总用户数量944n_features = int(param_mat['num_features'])  # n_features=10X_normal = np.random.standard_normal((n_movie, n_features))  # 把数据转换成高斯分布theta_normal = np.random.standard_normal((n_user, n_features))param_normal = serialize(X_normal, theta_normal)# 均值归一化Y_norm, Y_mean = normalizeRatings(Y, R)  # Y_mean：电影评分的平均值均，Y_norm：均值归一化结果λ = 10res = opt.minimize(fun=regularized_cost, x0=param_normal, args=(Y_norm, R, n_features, λ),method='TNC', jac=regularized_gradient)print(res)X_trained, theta_trained = deserialize(res.x, n_movie, n_user, n_features)print(X_trained.shape, theta_trained.shape)Y_pred = X_trained @ theta_trained.Ty_pred = Y_pred[:, -1] + Y_mean.flatten()  # 还原会类似于原始电影评分的数值结果index = np.argsort(y_pred)  # 将y_pred中的元素从小到大排序后，提取对应原数组的索引indexprint(index)index_reverse = np.flipud(index)  # 数组反转movies = []with open('data/movie_ids.txt', 'r', encoding='latin 1') as f:for line in f:tokens = line.strip().split(' ')movies.append(' '.join(tokens[1:]))print(len(movies))# 推荐用户喜好评分较高的电影for i in range(10):print(index_reverse[i], movies[index_reverse[i]], y_pred[index_reverse[i]])# 通过将矩阵渲染成图像来尝试“可视化”数据，我们不能从这里收集太多，但它确实给我们了解用户和电影的相对密度fig, ax = plt.subplots(figsize=(8, 8))ax.imshow(Y, cmap=plt.cm.hot)ax.set_xlabel('Users')ax.set_ylabel('Movies')# ax = sns.heatmap(Y)fig.tight_layout()plt.show()

吴恩达机器学习课程：编程练习 | (8) ex8-anomaly detection and recommendation相关推荐

8. 吴恩达机器学习课程-作业8-异常检测和推荐系统
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 8. ...
7. 吴恩达机器学习课程-作业7-Kmeans and PCA
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 7. ...
吴恩达机器学习课程笔记一
吴恩达机器学习课程笔记前言监督学习---`Supervised learning` 无监督学习---`Unsupervised learning` 聚类异常检测降维增强学习---`Reinf ...
吴恩达机器学习课程（第一周）
吴恩达机器学习课程(第一周) welcome Welcome to Machine learning!(video) 机器学习在各领域的应用很多比如搜索引擎图像识别垃圾邮件处理这是一门让计算机 ...
github标星11600+：最全的吴恩达机器学习课程资源（完整笔记、中英文字幕视频、python作业，提供百度云镜像！）...
吴恩达老师的机器学习课程,可以说是机器学习入门的第一课和最热门课程,我在github开源了吴恩达机器学习个人笔记,用python复现了课程作业,成为热门项目,star数达到11671+,曾经有相关报道 ...
6. 吴恩达机器学习课程-作业6-SVM
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 6. ...
5. 吴恩达机器学习课程-作业5-偏差和方差
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 5. ...
4. 吴恩达机器学习课程-作业4-神经网络学习
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 4. ...
3. 吴恩达机器学习课程-作业3-多分类和神经网络
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 3. ...
2.吴恩达机器学习课程-作业2-逻辑回归
fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接文章目录 2. ...

吴恩达机器学习课程：编程练习 | (8) ex8-anomaly detection and recommendation

吴恩达机器学习课程：编程练习 | (8) ex8-anomaly detection and recommendation相关推荐

最新文章

热门文章