1. 异常检测

"""
案例1: 检测异常服务器
数据集:data/ex8data1.mat
注:算法手动实现
"""import numpy as np
import scipy.io as sio
import matplotlib.pyplot as pltdef estimate_gaussian(X, isCovariance):  # 计算均值与协方差矩阵means = np.mean(X, axis=0)if isCovariance:sigma2 = (X - means).T @ (X - means) / len(X)  # 主对角线上的数值是方差,其他是协方差,也可用sigma=np.cov(X.T)else:sigma2 = np.var(X, axis=0)  # np.var计算方差return means, sigma2def gaussian_distribution(X, means, sigma2):if np.ndim(sigma2) == 1:  # 数组维度是2,为原高斯分布模型sigma2 = np.diag(sigma2)X = X - meansn = X.shape[1]first = np.power(2 * np.pi, -n / 2) * (np.linalg.det(sigma2) ** (-0.5))second = np.diag(X @ np.linalg.inv(sigma2) @ X.T)p = first * np.exp(-0.5 * second)p = p.reshape(-1, 1) # 转为(n,1)return pdef plotGaussian(X, means, sigma2):plt.figure()x = np.arange(0, 30, 0.5)y = np.arange(0, 30, 0.5)xx, yy = np.meshgrid(x, y)z = gaussian_distribution(np.c_[xx.ravel(), yy.ravel()], means, sigma2)  # 计算对应的高斯分布函数zz = z.reshape(xx.shape)plt.plot(X[:, 0], X[:, 1], 'bx')contour_levels = [10 ** h for h in range(-20, 0, 3)]plt.contour(xx, yy, zz, contour_levels)def select_threshold(yval, p):bestEpsilon = 0bestF1 = 0epsilons = np.linspace(min(p), max(p), 1000)for e in epsilons:p_ = p < etp = np.sum((yval == 1) & (p_ == 1))fp = np.sum((yval == 0) & (p_ == 1))fn = np.sum((yval == 1) & (p_ == 0))prec = tp / (tp + fp) if (tp + fp) else 0  # precision 精确度rec = tp / (tp + fn) if (tp + fn) else 0   # recall召回率F1_e = 2 * prec * rec / (prec + rec) if (prec + rec) else 0 # F1scoreif F1_e > bestF1:bestF1 = F1_ebestEpsilon = ereturn bestEpsilon, bestF1if __name__ == '__main__':mat = sio.loadmat('data/ex8data1.mat')  # dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])X = mat.get('X')  # X-->(307,2)X_val, y_val= mat['Xval'], mat['yval']# 原高斯分布模型means, sigma2 = estimate_gaussian(X, isCovariance=False)plotGaussian(X, means, sigma2)# 多元高斯分布模型means, sigma2 = estimate_gaussian(X, isCovariance=True)plotGaussian(X, means, sigma2)# 通过交叉验证集选取阈值εp_val = gaussian_distribution(X_val, means, sigma2)bestEpsilon, bestF1 = select_threshold(y_val, p_val)print(bestEpsilon, bestF1)p = gaussian_distribution(X, means, sigma2)anoms = np.array([X[i] for i in range(X.shape[0]) if p[i] < bestEpsilon])print(len(anoms))print('sss',anoms)plotGaussian(X, means, sigma2)plt.scatter(anoms[:, 0], anoms[:, 1], c='r', marker='o')# plt.plot(X[:, 0], X[:, 1], 'bx')plt.show()# -------------案例2: 高维数据的异常检测  数据集:data/ex8data1.mat-------------------mat_h = sio.loadmat('data/ex8data2.mat') # dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])X2 = mat_h['X']X_val2, y_val2 = mat_h['Xval'], mat_h['yval']print(X2.shape,X_val2.shape,y_val2.shape)means_h,sigma2_h = estimate_gaussian(X2, isCovariance=True)pval = gaussian_distribution(X_val2, means_h, sigma2_h)bestEpsilon_h, bestF1_h = select_threshold(y_val2, pval)p_h = gaussian_distribution(X2,means_h,sigma2_h)anoms_h = np.array([X2[i] for i in range(X2.shape[0]) if p_h[i] < bestEpsilon_h])print(len(anoms_h))

2. 01-异常检测01

"""
案例1: 检测异常服务器
数据集:data/ex8data1.mat
注:多元高斯分布模型算法借助其他库(scipy)来实现
"""import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as snssns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_reportdef select_threshold(X, X_val, y_val):"""use CV data to find the best epsilonReturns:e: best epsilon with the highest f-scoref-score: such best f-score"""# create multivariate model using training datamu = X.mean(axis=0)cov = np.cov(X.T)multi_normal = stats.multivariate_normal(mu, cov)# this is key, use CV data for fine tuning hyper parameterspval = multi_normal.pdf(X_val)# set up epsilon candidatesepsilon = np.linspace(np.min(pval), np.max(pval), num=10000)# calculate f-scorefs = []for e in epsilon:y_pred = (pval <= e).astype('int')fs.append(f1_score(y_val, y_pred))# find the best f-scoreargmax_fs = np.argmax(fs)return epsilon[argmax_fs], fs[argmax_fs]def predict(X, X_val, e, X_test, y_test):"""with optimal epsilon, combine X, Xval and predict XtestReturns:multi_normal: multivariate normal modely_pred: prediction of test data"""Xdata = np.concatenate((X, X_val), axis=0)mu = Xdata.mean(axis=0)cov = np.cov(Xdata.T)multi_normal = stats.multivariate_normal(mu, cov)# calculate probability of test datapval = multi_normal.pdf(X_test)y_pred = (pval <= e).astype('int')print(classification_report(y_test, y_pred))return multi_normal, y_predif __name__ == '__main__':mat = sio.loadmat('data/ex8data1.mat')  # dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])X = mat.get('X')  # X-->(307,2)# 该方法与前一种方法,在数据有区别,前一种没有划分测试集数据,该方法划分出测试集数据,X_val, X_test, y_val, y_test = train_test_split(mat.get('Xval'), mat.get('yval').ravel(), test_size=0.5)e, fs = select_threshold(X, X_val, y_val)print('Best epsilon: {}\nBest F-score on validation data: {}'.format(e, fs))multi_normal, y_pred = predict(X, X_val, e, X_test, y_test)# create a gridx, y = np.mgrid[0:30:0.5, 0:30:0.5]  # x->(3000,3000),y->(3000,3000)pos = np.dstack((x, y))print(pos.shape)fig, ax = plt.subplots()# plot probability densityax.contourf(x, y, multi_normal.pdf(pos), cmap='Blues')# plot original data pointssns.regplot('Latency', 'Throughput',data=pd.DataFrame(X, columns=['Latency', 'Throughput']),fit_reg=False,ax=ax,scatter_kws={"s": 10,"alpha": 0.4})# construct test DataFramedata = pd.DataFrame(X_test, columns=['Latency', 'Throughput'])data['y_pred'] = y_pred# mark the predicted anamoly of CV data. We should have a test set for this...anamoly_data = data[data['y_pred'] == 1]ax.scatter(anamoly_data['Latency'], anamoly_data['Throughput'], marker='x', s=50, c='r')print(anamoly_data.shape)plt.show()# -----------------------------------high dimension data-------------------------------mat_h = sio.loadmat('./data/ex8data2.mat')X_h = mat_h.get('X')Xval, Xtest, yval, ytest = train_test_split(mat_h.get('Xval'), mat_h.get('yval').ravel(), test_size=0.5)e_h, fs_h = select_threshold(X_h, Xval, yval)print('Best epsilon: {}\nBest F-score on validation data: {}'.format(e_h, fs_h))multi_normal_h, y_pred_h = predict(X_h, Xval, e_h, Xtest, ytest)print('find {} anamolies'.format(y_pred_h.sum())) # 由于用了随机的分割数据,每次运行,结果可能都不相同

3. 推荐系统

"""
案例:给用户推荐电影
"""import matplotlib.pyplot as plt
import seaborn as snssns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.optimize as optdef serialize(X, theta):"""serialize 2 matrix"""# X (movie, feature), (1682, 10): movie features# theta (user, feature), (943, 10): user preferencereturn np.append(X.flatten(), theta.flatten())def deserialize(param, n_movie, n_user, n_features):"""into ndarray of X(1682, 10), theta(943, 10)"""return param[:n_movie * n_features].reshape(n_movie, n_features), \param[n_movie * n_features:].reshape(n_user, n_features)# recommendation fn
def cost_function(param, Y, R, n_features):"""compute cost for every r(i, j)=1Args:param: serialized X, thetaY (movie, user), (1682, 943): (movie, user) ratingR (movie, user), (1682, 943): (movie, user) has rating"""# theta (user, feature), (943, 10): user preference# X (movie, feature), (1682, 10): movie featuresn_movie, n_user = Y.shapeX, theta = deserialize(param, n_movie, n_user, n_features)inner = np.multiply(X @ theta.T - Y, R)return np.power(inner, 2).sum() / 2def gradient(param, Y, R, n_features):# theta (user, feature), (943, 10): user preference# X (movie, feature), (1682, 10): movie featuresn_movies, n_user = Y.shapeX, theta = deserialize(param, n_movies, n_user, n_features)inner = np.multiply(X @ theta.T - Y, R)  # (1682, 943)# X_grad (1682, 10)X_grad = inner @ theta# theta_grad (943, 10)theta_grad = inner.T @ X# roll them together and returnreturn serialize(X_grad, theta_grad)def regularized_cost(param, Y, R, n_features, λ=1):reg_term = np.power(param, 2).sum() * (λ / 2)return cost_function(param, Y, R, n_features) + reg_termdef regularized_gradient(param, Y, R, n_features, λ=1):grad = gradient(param, Y, R, n_features)reg_term = λ * paramreturn grad + reg_termdef normalizeRatings(Y, R):  # 均值归一化,见算法见笔记16.6Y_mean = (Y.sum(axis=1) / R.sum(axis=1)).reshape(-1, 1)Y_norm = (Y - Y_mean) * Rreturn Y_norm, Y_meanif __name__ == '__main__':movies_mat = sio.loadmat('./data/ex8_movies.mat')  # dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])# Y->(1682,943)(电影数×用户数)用户电影评分矩阵(数组)# R->(1682,943) 电影评分的二进制值的"指示符"数据,例如[5 4 0 0 3 0]对应[1 1 0 1 0]Y, R = movies_mat.get('Y'), movies_mat.get('R')Y_plot = Y# param_mat-->dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies',# 'num_features'])param_mat = sio.loadmat('./data/ex8_movieParams.mat')theta, X = param_mat.get('Theta'), param_mat.get('X')  # theta->(943,10) 10个特征数量 X->(1682,10) 电影特征(偏好程度)矩阵# -------------选取一小段数据,计算代价函数-------------users = 4movies = 5features = 3X_sub = X[:movies, :features]theta_sub = theta[:users, :features]Y_sub = Y[:movies, :users]R_sub = R[:movies, :users]param_sub = serialize(X_sub, theta_sub)cost_sub = cost_function(param_sub, Y_sub, R_sub, features)  # λ=0时的regularized_cost即cost_function函数print("小段数据代价函初始值(λ=0):{}".format(cost_sub))cost_sub1 = regularized_cost(param_sub, Y_sub, R_sub, features, λ=1.5)print("小段数据代价函初始值(λ=1.5):{}".format(cost_sub1))# -------------整个数据,计算的代价函数-----------------param = serialize(X, theta)  # total real paramsprint(param.shape)cost_real = cost_function(serialize(X, theta), Y, R, 10)  # this is real total costprint("所有数据代价函初始值(λ=0):{}".format(cost_real))cost_real_1 = regularized_cost(serialize(X, theta), Y, R, 10, λ=1)print("所有数据代价函初始值(λ=1):{}".format(cost_real_1))# 添加一个新的用户,根据该用户偏好自动推荐电影nm = int(param_mat['num_movies'])  # 电影数量nm=1682new_ratings = np.zeros((nm, 1))new_ratings[0] = 4new_ratings[6] = 3new_ratings[11] = 5new_ratings[53] = 4new_ratings[63] = 5new_ratings[65] = 3new_ratings[68] = 5new_ratings[97] = 2new_ratings[182] = 4new_ratings[225] = 5new_ratings[354] = 5Y = np.c_[Y, new_ratings]  # now I become user 0,在Y最后插入一列,insert貌似只能插入相同数值的列。Y->(1682,944)R = np.c_[R, new_ratings != 0]  # R->(1682,944)n_movie, n_user = Y.shape  # 电影总数量1682,总用户数量944n_features = int(param_mat['num_features'])  # n_features=10X_normal = np.random.standard_normal((n_movie, n_features))  # 把数据转换成高斯分布theta_normal = np.random.standard_normal((n_user, n_features))param_normal = serialize(X_normal, theta_normal)# 均值归一化Y_norm, Y_mean = normalizeRatings(Y, R)  # Y_mean:电影评分的平均值均,Y_norm:均值归一化结果λ = 10res = opt.minimize(fun=regularized_cost, x0=param_normal, args=(Y_norm, R, n_features, λ),method='TNC', jac=regularized_gradient)print(res)X_trained, theta_trained = deserialize(res.x, n_movie, n_user, n_features)print(X_trained.shape, theta_trained.shape)Y_pred = X_trained @ theta_trained.Ty_pred = Y_pred[:, -1] + Y_mean.flatten()  # 还原会类似于原始电影评分的数值结果index = np.argsort(y_pred)  # 将y_pred中的元素从小到大排序后,提取对应原数组的索引indexprint(index)index_reverse = np.flipud(index)  # 数组反转movies = []with open('data/movie_ids.txt', 'r', encoding='latin 1') as f:for line in f:tokens = line.strip().split(' ')movies.append(' '.join(tokens[1:]))print(len(movies))# 推荐用户喜好评分较高的电影for i in range(10):print(index_reverse[i], movies[index_reverse[i]], y_pred[index_reverse[i]])# 通过将矩阵渲染成图像来尝试“可视化”数据,我们不能从这里收集太多,但它确实给我们了解用户和电影的相对密度fig, ax = plt.subplots(figsize=(8, 8))ax.imshow(Y, cmap=plt.cm.hot)ax.set_xlabel('Users')ax.set_ylabel('Movies')# ax = sns.heatmap(Y)fig.tight_layout()plt.show()

吴恩达机器学习课程:编程练习 | (8) ex8-anomaly detection and recommendation相关推荐

  1. 8. 吴恩达机器学习课程-作业8-异常检测和推荐系统

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 8. ...

  2. 7. 吴恩达机器学习课程-作业7-Kmeans and PCA

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 7. ...

  3. 吴恩达机器学习课程笔记一

    吴恩达机器学习课程笔记 前言 监督学习---`Supervised learning` 无监督学习---`Unsupervised learning` 聚类 异常检测 降维 增强学习---`Reinf ...

  4. 吴恩达机器学习课程(第一周)

    吴恩达机器学习课程(第一周) welcome Welcome to Machine learning!(video) 机器学习在各领域的应用很多 比如搜索引擎 图像识别 垃圾邮件处理 这是一门让计算机 ...

  5. github标星11600+:最全的吴恩达机器学习课程资源(完整笔记、中英文字幕视频、python作业,提供百度云镜像!)...

    吴恩达老师的机器学习课程,可以说是机器学习入门的第一课和最热门课程,我在github开源了吴恩达机器学习个人笔记,用python复现了课程作业,成为热门项目,star数达到11671+,曾经有相关报道 ...

  6. 6. 吴恩达机器学习课程-作业6-SVM

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 6. ...

  7. 5. 吴恩达机器学习课程-作业5-偏差和方差

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 5. ...

  8. 4. 吴恩达机器学习课程-作业4-神经网络学习

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 4. ...

  9. 3. 吴恩达机器学习课程-作业3-多分类和神经网络

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 3. ...

  10. 2.吴恩达机器学习课程-作业2-逻辑回归

    fork了别人的项目,自己重新填写,我的代码如下 https://gitee.com/fakerlove/machine-learning/tree/master/code 代码原链接 文章目录 2. ...

最新文章

  1. 强势推荐一位 Python 原创自动化大佬!
  2. MATLAB2013a的license过期的解决办法
  3. HTML5 Canvas 绘制库存变化折线 增加超储告罄线
  4. 数据结构(5)之单链表的操作(补充)
  5. 转自JIM Wang:把 isv.config.xml 按钮事件移动到 entity.onload()
  6. 纹理特征:灰度共生矩阵
  7. druid字段级_Druid的数据结构
  8. 试卷汇编与解析二级C语言,计算机等级考试试卷汇编与解析
  9. mysql 消息队列_MYSQL模拟消息队列(转载) | 学步园
  10. 本周新出开源计算机视觉代码汇总(含图像超分辨、视频目标分割、行人重识别、点云识别等)...
  11. 历史沉重,人人生活在历史里
  12. 腾讯面试官这样问我二叉树,我刚好都会 | 原力计划
  13. NSTimer里的userInfo
  14. Kafka 分布式消息系统详解
  15. 考研数据结构常考的代码题总结 C语言实现
  16. php fckeditor,FCKeditor的PHP配备
  17. PHPWAMP自启异常,服务器重启后Apache等服务不会自动重启的原因分析
  18. 免费直播系统源码,可控的跑马灯,无需焦点
  19. MongoTemplate地理位置查询(标准)
  20. 使用Ubuntu搭建Web服务器

热门文章

  1. 公共建筑能耗监控节能方案改善措施及系统介绍
  2. 黑马程序员-银行调度系统
  3. CSPM项目管理专业人员能力评价证书报考条件与考试形势
  4. javascript禁止页面后退history
  5. 公积金新政被指看上去美 一线城市额度不够用
  6. C++: 计算累积密度函数 (CDF) 非中心贝塔分布(附完整源码)
  7. matlab函数repmat,MATLAB中mean()函数repmat()函数的用法
  8. JSF测试框架特性和性能检测
  9. TikTok的发展历程,TikTok直播带货要怎么做?
  10. libmawt.so: ld.so.1: java: fatal: libXm.so.4: open failed: No such file or directory