原作者：https://github.com/fengdu78/lihang-code
参考作者：https://github.com/Dod-o/Statistical-Learning-Method_Code/blob/master

1.统计学习方法概论

概论

回归

最小二乘法拟合系数

正则化

Code

import numpy as np
import scipy as sp
from scipy.optimize import leastsq
import matplotlib.pyplot as plt#①目标函数
def real_func(x):
    """Target function to be fitted: sin(2*pi*x).

    ``x`` may be a scalar or a NumPy array; the result has the same shape.
    """
    angle = 2 * np.pi * x
    return np.sin(angle)
#②拟合多项式：将多项式系数 p 转变成多项式并在 x 处求值，例如 p=array([2.99999999])，x=np.linspace(0, 1, 1000)
def fit_func(p, x):
    """Evaluate the polynomial with coefficients ``p`` at ``x``.

    Args:
        p: polynomial coefficients, highest power first.
        x: scalar or array of points at which to evaluate.

    Returns:
        The polynomial value(s) at ``x``.
    """
    # np.poly1d([1, 2, 3]) represents 1*x**2 + 2*x**1 + 3*x**0
    f = np.poly1d(p)
    return f(x)
#③残差：作为最小二乘法输入参数
def residuals_func(p, x, y):
    """Return the residuals ``fit_func(p, x) - y``.

    This is the callable handed to ``scipy.optimize.leastsq``, which
    minimises the sum of squares of the returned values.

    Args:
        p: polynomial coefficients, highest power first.
        x: sample positions.
        y: observed values at ``x``.
    """
    ret = fit_func(p, x) - y
    return ret


# (4) Residuals with a regularization term (optional): mitigates over-fitting
# #原理：通过权重衰减，减少拟合函数的波动
# regularization=0.0001
# def residuals_func_regularization(p, x, y):
#     ret = fit_func(p, x) - y
#     ret = np.append(ret,
#                     np.sqrt(0.5 * regularization * np.square(p)))  # L2范数作为正则化项
#     return ret#最小二乘法：获取拟合多项式的系数
def fitting(M):
    """Fit a degree-``M`` polynomial to noisy samples of ``real_func`` and plot it.

    Args:
        M: degree of the polynomial; ``M + 1`` coefficients are fitted.

    Returns:
        The tuple returned by ``scipy.optimize.leastsq``; element 0 holds the
        fitted coefficients.

    Side effects:
        Prints the fitted coefficients and opens a matplotlib window showing
        the target curve, the fitted curve and the noisy samples.
    """
    # (1) Sample the target function at 10 evenly spaced points and add
    #     Gaussian noise (mean 0, std 0.1) to obtain the points to fit.
    x = np.linspace(0, 1, 10)
    y_ = real_func(x)
    y = [np.random.normal(0, 0.1) + y1 for y1 in y_]

    # (2) Least squares: start from M + 1 random coefficients.
    p_init = np.random.rand(M + 1)
    # args carries the experimental data passed on to residuals_func
    p_lsq = leastsq(residuals_func, p_init, args=(x, y))
    # p_lsq_9 = leastsq(residuals_func, p_init, args=(x, y))
    # p_lsq_9_regularization = leastsq(residuals_func_regularization, p_init, args=(x, y))
    print('Fitting Parameters:', p_lsq[0])

    # Visualisation
    x_points = np.linspace(0, 1, 1000)
    plt.plot(x_points, real_func(x_points), label='real')
    plt.plot(x_points, fit_func(p_lsq[0], x_points), label='fitted curve')
    # plt.plot(x_points, fit_func(p_lsq_9[0], x_points), label='over_fitted curve')
    # plt.plot(x_points, fit_func(p_lsq_9_regularization[0], x_points), label='regularization curve')
    plt.plot(x, y, 'bo', label='noise')
    plt.legend()
    plt.show()
    return p_lsq


# Degree-0 polynomial (a constant)
p_lsq_0 = fitting(M=0)
print(p_lsq_0)
#1次多项式
# p_lsq_1 = fitting(M=1)
#3次多项式
# p_lsq_3 = fitting(M=3)
#9次多项式
# #多项式曲线通过了每个数据点，但是造成了过拟合
# p_lsq_9 = fitting(M=9)

2.感知机

概念

Code

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt#①load data
#Iris数据集：一共包括150行记录
#前四列为花萼长度，花萼宽度，花瓣长度，花瓣宽度等4个用于识别鸢尾花的属性；第5列为鸢尾花的类别（包括Setosa，Versicolour，Virginica三类）。
#1)数据，label:0,1
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)#数据
df['label'] = iris.target#label:0,1,2
#②Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'label'], dtype='object')
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'
]
print(df.label.value_counts())#前50个属于0类，中间50个属于1类，后50个属于2类#2)数据可视化
# plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0')
# plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1')
# plt.xlabel('sepal length')
# plt.ylabel('sepal width')
# plt.legend()
# plt.show()#②制作数据-label：X-y
#补充知识点df.iloc[]
# df.iloc[0, 1]#选择行号=0，列号=1的数据
# df.iloc[[0, 2], [1, 3]]#选择行号为0和2，列号为1和3的数据
# df.iloc[1:3, 0:3]#选择行号为1-2，列号为0-2的数据，注意切片范围为左闭右开
# df.iloc[:, [True, False, True, False]]#行号全选，选择第1列和第3列数据
# df.iloc[:, lambda df: [0, 2]]#选择dataframe的第1列与第3列
data = np.array(df.iloc[:100, [0, 1, -1]])#前100行，0、1、-1列（取数据集的前100个数据的前两个属性进行练习）
X, y = data[:,:-1], data[:,-1]
y = np.array([1 if i == 1 else -1 for i in y])#y:1,-1#方案一：自定义线性感知机Model
#数据线性可分，二分类数据
#此处为一元一次线性方程
#①利用线性感知机拟合
class Model:
    """Linear perceptron (primal form) trained by stochastic gradient descent.

    The decision value is ``x . w + b``; a sample ``(x, y)`` with ``y`` in
    {+1, -1} counts as misclassified when ``y * (x . w + b) <= 0``.
    """

    def __init__(self):
        # One weight per feature. The module-level ``data`` array holds the
        # feature columns plus one trailing label column, hence the ``- 1``.
        self.w = np.ones(len(data[0]) - 1, dtype=np.float32)
        self.b = 0
        self.l_rate = 0.1
        # self.data = data

    def sign(self, x, w, b):
        """Return the raw decision value ``x . w + b`` (it is not thresholded)."""
        y = np.dot(x, w) + b
        return y

    def fit(self, X_train, y_train):
        """Update ``w`` and ``b`` until a full pass makes no correction.

        Illustration embedded in the original post:
        https://img-blog.csdnimg.cn/efac0db9159d45ce92eaba406d028c5d.png

        NOTE: the loop only exits after a pass with zero misclassified
        samples, so it does not terminate on data that is not linearly
        separable.

        Returns:
            The string ``'Perceptron Model!'``.
        """
        converged = False
        while not converged:
            wrong_count = 0
            for X, y in zip(X_train, y_train):
                # Key step: a misclassified point pulls the hyperplane
                # towards itself by l_rate * y * x (and l_rate * y for b).
                if y * self.sign(X, self.w, self.b) <= 0:
                    self.w = self.w + self.l_rate * np.dot(y, X)
                    self.b = self.b + self.l_rate * y
                    wrong_count += 1
            if wrong_count == 0:
                converged = True
        return 'Perceptron Model!'

    def score(self):
        """Placeholder; not implemented."""
        pass
perceptron = Model()
perceptron.fit(X, y)#②可视化拟合结果
# x_points = np.linspace(4, 7, 10)
# y_ = -(perceptron.w[0] * x_points + perceptron.b) / perceptron.w[1]#key
# plt.plot(x_points, y_)
# plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0')
# plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1')
# plt.xlabel('sepal length')
# plt.ylabel('sepal width')
# plt.legend()
# plt.show()#方案二：引用sklearn中线性感知机包
import sklearn
from sklearn.linear_model import Perceptron
print(sklearn.__version__)
#①利用线性感知机拟合
# Per the original note: ``tol`` stops training once the loss improvement
# between two consecutive iterations drops below the threshold; pass
# ``tol=None`` to disable that early stop and keep iterating.
clf = Perceptron(
    fit_intercept=True,
    max_iter=1000,
    # tol=None,
    shuffle=True,
)
clf.fit(X, y)# Weights assigned to the features.
print(clf.coef_)
# 截距 Constants in decision function.
print(clf.intercept_)#②可视化拟合结果
# 画布大小
plt.figure(figsize=(10,10))
# 中文标题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.title('鸢尾花线性数据示例')
plt.scatter(data[:50, 0], data[:50, 1], c='b', label='Iris-setosa',)
plt.scatter(data[50:100, 0], data[50:100, 1], c='orange', label='Iris-versicolor')
# 画感知机的线
x_ponits = np.arange(4, 8)
y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1]#key
plt.plot(x_ponits, y_)
# 其他部分
plt.legend()  # 显示图例
plt.grid(False)  # 不显示网格
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
plt.show()

3.K近邻法（推荐：例子->代码->概念）

概念

①

②

③

Code

import math
from itertools import combinations
#####################################################
#距离度量公式
#例3.1：已知二维空间三个点，求各种p下，x1的最近邻点（lp）
# def L(x, y, p=2):
#     # x1 = [1, 1], x2 = [5,1]
#     if len(x) == len(y) and len(x) > 1:
#         sum = 0
#         for i in range(len(x)):
#             sum += math.pow(abs(x[i] - y[i]), p)
#         return math.pow(sum, 1 / p)
#     else:
#         return 0
# x1 = [1, 1]
# x2 = [5, 1]
# x3 = [4, 4]
# for i in range(1, 5):
#     r = {'1-{}'.format(c): L(x1, c, p=i) for c in [x2, x3]}#给定各p下，x1的各近邻点的距离度量值
#     print(min(zip(r.values(), r.keys())))#r.values()：给定p下，x1的最近邻点距离度量值， r.keys()：给定p下，x1的最近邻点
################################################################################
#KNN模型：通过‘多数’原则，预测类别
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
#①iris数据
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
#print(df)#可视化iris数据
plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0')
plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
# plt.show()#②数据划分
data = np.array(df.iloc[:100, [0, 1, -1]])#属性值'sepal length', 'sepal width'，'label'
X, y = data[:,:-1], data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)#80%训练20%测试#③方案一：自定义KNN模型
class KNN:
    """Brute-force k-nearest-neighbour classifier with majority voting."""

    def __init__(self, X_train, y_train, n_neighbors=3, p=2):
        """
        parameter: n_neighbors number of neighbours that vote
        parameter: p order of the norm used as the distance measure
        """
        self.n = n_neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the majority label among the ``n`` training points nearest to ``X``."""
        # 1) Seed the candidate list with the first n training points,
        #    storing (distance, label) pairs.
        knn_list = []
        for i in range(self.n):
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_list.append((dist, self.y_train[i]))

        # 2) Scan the remaining points: whenever one is closer than the
        #    current farthest candidate, it replaces that candidate, so the
        #    list ends up holding the n nearest points.
        for i in range(self.n, len(self.X_train)):
            max_dist_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_list[max_dist_index][0] > dist:
                knn_list[max_dist_index] = (dist, self.y_train[i])

        # 3) Majority vote: sort the label counts ascending, take the last key.
        knn = [k[-1] for k in knn_list]
        count_pairs = Counter(knn)
        max_count = sorted(count_pairs.items(), key=lambda x: x[1])[-1][0]
        return max_count

    def score(self, X_test, y_test):
        """Return the fraction of ``X_test`` rows whose prediction equals ``y_test``."""
        right_count = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                right_count += 1
        return right_count / len(X_test)
# #调用模型，并验证模型准确度
clf = KNN(X_train, y_train)
# print('Accuracy: %{}'.format(clf.score(X_test, y_test)*100))
test_point = [6.0, 3.0]
# print('Test Point Category: {}'.format(clf.predict(test_point)))#结果可视化：test_point->类别1
plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0')
plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1')
plt.plot(test_point[0], test_point[1], 'bo', label='test_point')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
# plt.show()#③方案二：KNN模型包
from sklearn.neighbors import KNeighborsClassifier
clf_sk = KNeighborsClassifier()
clf_sk.fit(X_train, y_train)
# print('Accuracy: %{}'.format(clf_sk.score(X_test, y_test)*100))############################################################################################
#Kd树：快速寻找k个最近邻点
#方案一：自定义kd-tree每个结点中主要包含的数据结构如下
class KdNode(object):
    """One node of a kd-tree."""

    def __init__(self, dom_elt, split, left, right):
        self.dom_elt = dom_elt  # the k-dimensional sample point stored here
        self.left = left  # kd-tree of the left side of the splitting hyperplane
        self.right = right  # kd-tree of the right side of the splitting hyperplane
        self.split = split  # index of the dimension this node was split on


class KdTree(object):
    """Balanced kd-tree built by recursive median splits; the root is ``self.root``."""

    def __init__(self, data):
        """Build the tree from ``data``, a sequence of equal-length points.

        The input sequence is copied before sorting, so the caller's list is
        left in its original order.
        """
        k = len(data[0])  # dimensionality of the points

        # See "Statistical Learning Methods" (2nd ed.), p.55, example 3.2:
        # the space is repeatedly bisected, cycling through the dimensions.
        def CreateNode(split, data_set):
            """Build the subtree for ``data_set``, splitting on dimension ``split``."""
            if not data_set:  # recursion ends on an empty data set
                return None
            # Key 1: order the points along the current dimension.
            data_set.sort(key=lambda x: x[split])
            # Key 2: the element at the middle index is the split point.
            split_pos = len(data_set) // 2
            median = data_set[split_pos]
            # Key 3: the children split on the next dimension (cyclically).
            split_next = (split + 1) % k
            return KdNode(
                median,
                split,
                CreateNode(split_next, data_set[:split_pos]),  # left subtree
                CreateNode(split_next, data_set[split_pos + 1:]))  # right subtree

        # Start from dimension 0. list(...) takes a shallow copy so the
        # in-place sort above does not reorder the caller's data.
        self.root = CreateNode(0, list(data))


# Pre-order traversal of the kd-tree
def preorder(root):
    """Print ``dom_elt`` of every node in pre-order (node, left subtree, right subtree).

    ``root`` must be a node, not ``None``; empty children are skipped.
    """
    print(root.dom_elt)
    if root.left:  # child is not empty
        preorder(root.left)
    if root.right:
        preorder(root.right)


# Search a built kd-tree for the sample point nearest to a target point:
from math import sqrt
from collections import namedtuple# 定义一个namedtuple：存放最近坐标点、最近距离和访问过的节点数
# Holds the nearest point, its distance, and the number of nodes visited.
result = namedtuple("Result_tuple",
                    "nearest_point  nearest_dist  nodes_visited")


# See "Statistical Learning Methods" (2nd ed.), p.57, example 3.3.
def find_nearest(tree, point):
    """Return the sample in ``tree`` nearest (Euclidean) to ``point``.

    Args:
        tree: object whose ``root`` is a node exposing ``split``,
            ``dom_elt``, ``left`` and ``right`` (or ``None`` when empty).
        point: target coordinates.

    Returns:
        A ``result`` namedtuple ``(nearest_point, nearest_dist, nodes_visited)``.
        For an empty tree this is ``([0] * k, inf, 0)``.
    """
    k = len(point)  # dimensionality of the target

    def travel(kd_node, target, max_dist):
        # Recursion ends below a leaf: nothing found, infinite distance.
        if kd_node is None:
            return result([0] * k, float("inf"), 0)

        # (1) Locate the nearer and the further subtree.
        nodes_visited = 1
        s = kd_node.split  # dimension this node splits on
        pivot = kd_node.dom_elt  # the splitting point itself
        if target[s] <= pivot[s]:
            # target lies on the left side of the splitting hyperplane
            nearer_node = kd_node.left
            further_node = kd_node.right
        else:
            nearer_node = kd_node.right
            further_node = kd_node.left

        # (2a) Descend into the nearer side first; its best point becomes
        #      the "current nearest".
        temp1 = travel(nearer_node, target, max_dist)
        nearest = temp1.nearest_point
        dist = temp1.nearest_dist
        nodes_visited += temp1.nodes_visited

        # The true nearest point lies inside the hypersphere of radius
        # max_dist centred on the target.
        if dist < max_dist:
            max_dist = dist

        # Distance from the target to the splitting hyperplane along s.
        temp_dist = abs(pivot[s] - target[s])
        if max_dist < temp_dist:
            # The hypersphere does not cross the hyperplane, so neither the
            # pivot nor the further side can hold a closer point.
            return result(nearest, dist, nodes_visited)

        # (2b) The pivot itself may be closer: compare Euclidean distances.
        temp_dist = sqrt(sum((p1 - p2)**2 for p1, p2 in zip(pivot, target)))
        if temp_dist < dist:
            nearest = pivot
            dist = temp_dist
            max_dist = dist  # shrink the hypersphere

        # (2c) The further side may hold an even closer point.
        temp2 = travel(further_node, target, max_dist)
        nodes_visited += temp2.nodes_visited
        if temp2.nearest_dist < dist:
            nearest = temp2.nearest_point
            dist = temp2.nearest_dist

        return result(nearest, dist, nodes_visited)

    # Start at the root with an unbounded search radius.
    return travel(tree.root, point, float("inf"))


# Example 3.2: build a balanced kd-tree from a 2-D data set
#例3.3(1)：求最近邻点
data = [[2,3],[5,4],[9,6],[4,7],[8,1],[7,2]]
kd = KdTree(data)
# print(preorder(kd.root))
ret = find_nearest(kd, [3,4.5])
# print (ret)#例3.3(2)：求最近邻点
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed intervals. It is bound to the old name
# so the timing code below keeps working unchanged.
from time import perf_counter as clock
from random import random
#
#产生：一个k维随机向量，每维分量值在0~1之间
def random_point(k):
    """Return a list of ``k`` values, each drawn with ``random()``."""
    coords = []
    for _ in range(k):
        coords.append(random())
    return coords


# Generate n random k-dimensional vectors
def random_points(k, n):
    """Return a list of ``n`` points, each produced by ``random_point(k)``."""
    points = []
    for _ in range(n):
        points.append(random_point(k))
    return points
#
N = 400000
t0 = clock()
kd2 = KdTree(random_points(3, N))            # 构建包含四十万个3维空间样本点的kd树
ret2 = find_nearest(kd2, [0.1,0.5,0.8])      # 四十万个样本点中寻找离目标最近的点
t1 = clock()
# print ("time: ",t1-t0, "s")
# print (ret2)##方案二：调用kd-tree工具包
##习题3.2：通过kd树包，求最近邻点
import numpy as np
from sklearn.neighbors import KDTree

# The six 2-D sample points that the tree is built from.
train_data = np.array([(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)])
tree = KDTree(train_data, leaf_size=2)
#(array([[1.80277564]]), array([[0]], dtype=int64))
dist, ind = tree.query(np.array([(3, 4.5)]), k=1)#k:要返回的最近邻点的数量
x1 = train_data[ind[0]][0][0]
x2 = train_data[ind[0]][0][1]# print(ind)#[[0]]
# print(train_data[ind[0]])#[[2 3]]
# print("x点的最近邻点是({0}, {1})".format(x1, x2))#x点的最近邻点是(2, 3)
# print("x点的最近邻距是({})".format(dist))#x点的最近邻距是([[1.80277564]])# ###########################
#习题3.1：(调用KNeighborsClassifier工具包)通过k近邻法进行空间划分，同时k的选择影响模型复杂度与预测准确度
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
#①数据预处理
data = np.array([[5, 12, 1], [6, 21, 0], [14, 5, 0], [16, 10, 0], [13, 19, 0],[13, 32, 1], [17, 27, 1], [18, 24, 1], [20, 20,0], [23, 14, 1],[23, 25, 1], [23, 31, 1], [26, 8, 0], [30, 17, 1],[30, 26, 1],[34, 8, 0], [34, 19, 1], [37, 28, 1]])
X_train = data[:, 0:2]
y_train = data[:, 2]
#②模型引用
models = (KNeighborsClassifier(n_neighbors=1, n_jobs=-1),KNeighborsClassifier(n_neighbors=2, n_jobs=-1))
models = (clf.fit(X_train, y_train) for clf in models)#③可视化总体设置：标题；子图；网格点
titles = ('K Neighbors with k=1', 'K Neighbors with k=2')
fig = plt.figure(figsize=(15, 5))#figsize:指定figure的宽和高，单位为英寸；
plt.subplots_adjust(wspace=0.4, hspace=0.4)#子图间宽/高度内边距，距离单位为子图平均宽度的比例（小数）。浮点数。默认值为0.2
X0, Y1 = X_train[:, 0], X_train[:, 1]
#X0[ 5  6 14 16 13 13 17 18 20 23 23 23 26 30 30 34 34 37]
x_min, x_max = X0.min() - 1, X0.max() + 1
y_min, y_max = Y1.min() - 1, Y1.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.2),np.arange(y_min, y_max, 0.2))#输入的是网格点的横纵坐标列向量（非矩阵）;输出的是坐标矩阵
#③可视化子图:模型选择；标题选择；子图位置选择
# One subplot per fitted model: pick the model, its title and its axes.
for clf, title, ax in zip(models, titles, fig.subplots(1, 2).flatten()):
    # 1) Filled contour plot of the predicted class over the grid (k=1; k=2).
    #    ravel() flattens each coordinate matrix to 1-D and np.c_ stacks the
    #    two vectors column-wise, giving one (x, y) row per grid point.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    colors = ('red', 'green', 'lightgreen', 'gray', 'cyan')
    # np.unique() returns the sorted distinct values, so the colormap gets
    # exactly one colour per predicted class.
    cmap = ListedColormap(colors[:len(np.unique(Z))])
    ax.contourf(xx, yy, Z, cmap=cmap, alpha=0.5)  # alpha: transparency

    # 2) Scatter plot of the training points. c maps the y_train values to
    #    colours through cmap; edgecolors colours the marker outline.
    ax.scatter(X0, Y1, c=y_train, s=50, edgecolors='k', cmap=cmap, alpha=0.5)
    ax.set_title(title)
# plt.show()# #########################
#习题3.3：通过自定义kd树，求最近邻点
#构建kd树，搜索待预测点所属区域
from collections import namedtuple
import numpy as np# 建立节点类
#namedtuple：存放结点
class Node(namedtuple("Node", "location left_child right_child")):
    """Tree node: the stored point plus its left and right subtrees."""

    def __repr__(self):
        return str(tuple(self))


# kd tree class
class KdTree():
    """Median-split tree over the rows of a 2-D array, with a leaf-descent lookup.

    ``k`` is the number of leading dimensions the split axis cycles through
    (``axis = depth % k``). With the default ``k=1`` every level splits on
    dimension 0.

    NOTE: ``_search`` only walks down one branch and never backtracks, so the
    point it returns is the last node on the descent path. That is not
    guaranteed to be the true nearest neighbour.
    """

    def __init__(self, k=1):
        self.k = k
        self.kdtree = None

    # Build the tree. See "Statistical Learning Methods" (2nd ed.), p.55,
    # example 3.2: the space is bisected repeatedly along the chosen axis.
    def _fit(self, X, depth=0):
        """Recursively build the subtree for the rows of ``X``; ``None`` if empty."""
        k = self.k
        # Key: the split axis and the index of the median row.
        axis = depth % k
        X = X[X[:, axis].argsort()]  # reorder the rows by their value on axis
        median = X.shape[0] // 2
        try:
            X[median]  # raises IndexError when X has no rows
        except IndexError:
            return None
        return Node(location=X[median],
                    left_child=self._fit(X[:median], depth + 1),
                    right_child=self._fit(X[median + 1:], depth + 1))

    def _search(self, point, tree=None, depth=0, best=None):
        """Descend from ``tree`` towards ``point``; return the last node's location.

        ``point`` is expected as a 2-D array with one row, e.g. ``[[3, 4.5]]``.
        """
        if tree is None:
            return best
        if best is None:
            # The starting node is the first candidate. Without this, a
            # descent that ends at the root returned None.
            best = tree.location
        k = self.k
        # (1) Pick the branch on the side of the split where the target lies.
        if point[0][depth % k] < tree.location[depth % k]:
            next_branch = tree.left_child
        else:
            next_branch = tree.right_child
        if next_branch is not None:
            best = next_branch.location
        # (2) Continue down the chosen branch with the next split axis.
        return self._search(point,
                            tree=next_branch,
                            depth=depth + 1,
                            best=best)

    def fit(self, X):
        """Build the tree from ``X`` (2-D array), store it and return its root."""
        self.kdtree = self._fit(X)
        return self.kdtree

    def predict(self, X):
        """Return the location found by descending the stored tree towards ``X``."""
        res = self._search(X, self.kdtree)
        return res
#
KNN = KdTree()
X_train = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])
KNN.fit(X_train)
X_new = np.array([[3, 4.5]])
res = KNN.predict(X_new)
#
x1 = res[0]
x2 = res[1]
# print("x点的最近邻点是({0}, {1})".format(x1, x2))

4.朴素贝叶斯

概论

5.决策树

概论

6.逻辑斯谛回归与最大熵（分类问题）

概念

优化方法举例：梯度下降法

代码

from math import exp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split#key#①data pre-processing；详解见《2.感知机》
def create_data():
    """Load iris and return ``(X, y)`` for its first 100 rows.

    ``X`` holds the first two feature columns (sepal length, sepal width)
    and ``y`` the label column.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # first 100 rows; columns 0, 1 and the last one (label)
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # print(data)
    return data[:, :2], data[:, -1]


X, Y = create_data()
# print(X)#[[5.1 3.5] [4.9 3.]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)#②方案一：自定义LogisticRegression模型
class LogisticRegressionClassifier:
    """Binary logistic regression trained by per-sample gradient ascent.

    Labels are expected to be 0 or 1. After ``fit`` the weights are stored in
    ``self.weights`` as a column vector whose first entry is the bias.
    """

    def __init__(self, max_iter=200, learning_rate=0.01):
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` of scalar ``x``."""
        return 1 / (1 + exp(-x))

    def data_matrix(self, X):
        """Return ``X`` as a list of rows, each prefixed with a constant 1.0 (bias input)."""
        data_mat = []
        for d in X:
            data_mat.append([1.0, *d])  # key: *d unpacks the feature values
        return data_mat

    def fit(self, X, Y):
        """Learn ``self.weights`` from samples ``X`` and 0/1 labels ``Y``."""
        # label = np.mat(y)
        # e.g. [[1.0, 5.1, 2.5], [1.0, 4.9, 3.1], ...]
        data_mat = self.data_matrix(X)
        # column vector, one weight per entry of an augmented row
        self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)

        for iter_ in range(self.max_iter):
            for i in range(len(X)):
                # np.dot returns a 1-element array here. .item() extracts the
                # scalar explicitly, since math.exp would otherwise rely on
                # NumPy's deprecated implicit size-1-array-to-float conversion.
                z = np.dot(data_mat[i], self.weights).item()
                result = self.sigmoid(z)
                error = Y[i] - result
                # transpose turns the row into a column matching self.weights
                self.weights += self.learning_rate * error * np.transpose(
                    [data_mat[i]])
        # print('LogisticRegression Model(learning_rate={},max_iter={})'.format(
        #     self.learning_rate, self.max_iter))

    # def f(self, x):
    #     return -(self.weights[0] + self.weights[1] * x) / self.weights[2]

    def score(self, X_test, Y_test):
        """Return the fraction of samples on the correct side of the boundary.

        A sample counts as correct when ``x . w > 0`` with label 1 or
        ``x . w < 0`` with label 0; a value of exactly 0 counts as wrong.
        """
        right = 0
        X_test = self.data_matrix(X_test)
        for x, y in zip(X_test, Y_test):
            result = np.dot(x, self.weights)
            if (result > 0 and y == 1) or (result < 0 and y == 0):
                right += 1
        return right / len(X_test)


lr_clf = LogisticRegressionClassifier()
lr_clf.fit(X_train, Y_train)#key-key-key：更新权重
# print(lr_clf.score(X_test, Y_test))#测试模型#③可视化分类结果（分割线）
x_points = np.arange(4, 8)
# print(lr_clf.weights)#[[-0.8198753][3.3067517][-5.6368866]]
y_ = -(lr_clf.weights[1] * x_points + lr_clf.weights[0]) / lr_clf.weights[2]#key
# plt.plot(x_points, y_)#分割线
plt.scatter(X[:50,0],X[:50,1], label='0')
plt.scatter(X[50:,0],X[50:,1], label='1')
plt.legend()
# plt.show()###########################################################
#②方案二：调用LogisticRegression模型工具包
# scikit-learn实例
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, Y_train)
# print(clf.score(X_test, Y_test))
# print(clf.coef_, clf.intercept_)#[[ 2.67437021 -2.70708892]] [-6.13567596]，相当于方案一的lr_clf.weights#③可视化分类结果（分割线）
x_ponits = np.arange(4, 8)
y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1]#key
# plt.plot(x_ponits, y_)#分割线
plt.plot(X[:50, 0], X[:50, 1], 'bo', color='blue', label='0')
plt.plot(X[50:, 0], X[50:, 1], 'bo', color='orange', label='1')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
# plt.show()############################################
#最大熵模型
import math
from copy import deepcopy#①自定义MaxEntropy模型
class MaxEntropy:
    """Maximum-entropy classifier over categorical features.

    Each training sample is a list ``[label, feature, feature, ...]``. Every
    distinct ``(feature, label)`` pair seen in training is one indicator
    feature with its own weight. ``train`` repeatedly adds
    ``log(empirical_expectation / model_expectation) / C`` to each weight.
    """

    def __init__(self, EPS=0.005):
        # Sample data
        self._samples = []
        self._N = 0  # number of samples
        # Labels
        self._Y = set()  # distinct labels
        self._numXY = {}  # (x, y) pair -> number of occurrences
        # Features
        self._Ep_ = []  # empirical expectation (relative frequency) per pair
        self._n = 0  # number of distinct (x, y) pairs
        self._C = 0  # largest number of features in any sample
        self._xyID = {}  # (x, y) pair -> id
        self._IDxy = {}  # id -> (x, y) pair
        # Weights
        self._w = []
        self._lastw = []  # weights of the previous iteration
        # Convergence threshold
        self._EPS = EPS

    def loadData(self, dataset):
        """Store a deep copy of ``dataset`` and initialise counts, ids and weights."""
        # deepcopy so that later changes to the caller's nested lists cannot
        # affect the stored samples (a shallow copy would share inner lists).
        self._samples = deepcopy(dataset)
        for items in self._samples:
            y = items[0]  # e.g. 'no'
            X = items[1:]  # e.g. ['sunny', 'hot', 'high', 'FALSE']
            self._Y.add(y)  # a set ignores labels it already contains
            for x in X:
                if (x, y) in self._numXY:  # e.g. ('sunny', 'no')
                    self._numXY[(x, y)] += 1
                else:
                    self._numXY[(x, y)] = 1

        self._N = len(self._samples)
        self._n = len(self._numXY)
        # each sample carries one label, hence the ``- 1``
        self._C = max([len(sample) - 1 for sample in self._samples])

        self._w = [0] * self._n
        self._lastw = self._w[:]
        self._Ep_ = [0] * self._n
        # Key: tie together id <-> (x, y) pair and its empirical expectation.
        for i, xy in enumerate(self._numXY):
            self._Ep_[i] = self._numXY[xy] / self._N
            self._xyID[xy] = i
            self._IDxy[i] = xy

    def _Zx(self, X):
        """Return the normaliser Z(x): the sum over labels of exp(sum of active weights)."""
        zx = 0
        for y in self._Y:
            ss = 0
            for x in X:
                if (x, y) in self._numXY:
                    ss += self._w[self._xyID[(x, y)]]
            zx += math.exp(ss)
        return zx

    def _model_pyx(self, y, X):
        """Return the model probability P(y | X)."""
        zx = self._Zx(X)
        ss = 0
        for x in X:
            if (x, y) in self._numXY:
                ss += self._w[self._xyID[(x, y)]]
        pyx = math.exp(ss) / zx
        return pyx

    def _model_ep(self, index):
        """Return the model expectation of feature ``index``.

        Sums P(y | sample) / N over every stored sample that contains the
        feature value x of the pair, e.g. ('sunny', 'no').
        """
        x, y = self._IDxy[index]
        ep = 0
        for sample in self._samples:
            if x not in sample:
                continue
            # the whole sample (label included) is passed, as in the original
            pyx = self._model_pyx(y, sample)
            ep += pyx / self._N
        return ep

    def _convergence(self):
        """Return True when every weight changed by less than ``_EPS`` in the last pass."""
        for last, now in zip(self._lastw, self._w):
            if abs(last - now) >= self._EPS:
                return False
        return True

    def train(self, maxiter=1000):
        """Run up to ``maxiter`` update passes; on convergence print the result and stop."""
        for loop in range(maxiter):
            self._lastw = self._w[:]
            for i in range(self._n):
                ep = self._model_ep(i)  # model expectation of feature i
                # Key: the empirical expectation is a relative frequency,
                # while the model expectation accumulates P(y | sample) over
                # the relevant samples; their log-ratio drives the update.
                self._w[i] += math.log(self._Ep_[i] / ep) / self._C
            if self._convergence():
                print("iter:%d" % loop)
                print("w:", self._w)
                break

    def predict(self, X):
        """Return ``{label: P(label | X)}`` for the feature list ``X``.

        Example from the original post:
        {'no': 2.819781341881656e-06, 'yes': 0.9999971802186581}
        """
        Z = self._Zx(X)
        result = {}
        for y in self._Y:
            ss = 0
            for x in X:
                if (x, y) in self._numXY:
                    ss += self._w[self._xyID[(x, y)]]
            pyx = math.exp(ss) / Z
            result[y] = pyx
        return result


# (2) Data set
dataset = [['no', 'sunny', 'hot', 'high', 'FALSE'],['no', 'sunny', 'hot', 'high', 'TRUE'],['yes', 'overcast', 'hot', 'high', 'FALSE'],['yes', 'rainy', 'mild', 'high', 'FALSE'],['yes', 'rainy', 'cool', 'normal', 'FALSE'],['no', 'rainy', 'cool', 'normal', 'TRUE'],['yes', 'overcast', 'cool', 'normal', 'TRUE'],['no', 'sunny', 'mild', 'high', 'FALSE'],['yes', 'sunny', 'cool', 'normal', 'FALSE'],['yes', 'rainy', 'mild', 'normal', 'FALSE'],['yes', 'sunny', 'mild', 'normal', 'TRUE'],['yes', 'overcast', 'mild', 'high', 'TRUE'],['yes', 'overcast', 'hot', 'normal', 'FALSE'],['no', 'rainy', 'mild', 'high', 'TRUE']]#③调用模型，载入数据，训练权重
maxent = MaxEntropy()
x = ['overcast', 'mild', 'high', 'FALSE']
# maxent.loadData(dataset)
# maxent.train()
# print(maxent.predict(x))##############################################
#习题6.2 写出Logistic回归模型学习的梯度下降算法。
import numpy as np
import time
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl# 图像显示中文
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']


class LogisticRegression:
    """Logistic regression fitted by batch gradient ascent on the log-likelihood."""

    def __init__(self, learn_rate=0.1, max_iter=10000, tol=1e-2):
        self.learn_rate = learn_rate  # step size
        self.max_iter = max_iter  # maximum number of iterations
        self.tol = tol  # stop once every gradient component is within tol
        self.w = None  # weights, set by fit()

    def preprocessing(self, X):
        """Append a column of ones to ``X`` (the bias input) and return the new array."""
        row = X.shape[0]
        y = np.ones(row).reshape(row, 1)
        X_prepro = np.hstack((X, y))  # hstack joins horizontally
        return X_prepro

    def sigmod(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        return 1 / (1 + np.exp(-x))

    def fit(self, X_train, y_train):
        """Fit ``self.w`` (shape ``(1, n_features + 1)``, bias last).

        Args:
            X_train: array of shape ``(n_samples, n_features)``.
            y_train: 0/1 labels as a row vector of shape ``(1, n_samples)``.
        """
        X = self.preprocessing(X_train)
        y = y_train.T  # column vector, one label per row of X
        self.w = np.array([[0] * X.shape[1]], dtype=np.float64)

        for _ in range(self.max_iter):
            # Gradient of the log-likelihood: sum over samples of
            # x_i * (y_i - sigmoid(x_i . w)).
            z = np.dot(X, self.w.T)
            grad = X * (y - self.sigmod(z))
            grad = grad.sum(axis=0)
            # Stop when every component of the gradient is within tol.
            if (np.abs(grad) <= self.tol).all():
                break
            # Gradient ascent step (the likelihood is being maximised).
            self.w += self.learn_rate * grad

    def predict(self, x):
        """Return the estimated P(Y=1) per row of ``x``, shape ``(n_samples, 1)``.

        The values are probabilities and are not thresholded to 0/1.
        """
        p = self.sigmod(np.dot(self.preprocessing(x), self.w.T))
        return p

    def score(self, X, y):
        """Return ``1 - mean(|P(Y=1) - y|)`` over the samples.

        ``y`` is a row vector of shape ``(1, n_samples)``. Since ``predict``
        yields probabilities, this is a soft score, not a 0/1 accuracy.
        """
        y_c = self.predict(X)
        error_rate = np.sum(np.abs(y_c - y.T)) / y_c.shape[0]
        return 1 - error_rate

    def draw(self, X, y):
        """Plot the 3-D samples and the fitted separating plane.

        ``X`` must have exactly three feature columns, because the four
        fitted weights are unpacked as the plane ``a*x + b*y + c*z + d = 0``.
        """
        # Split into positive and negative samples.
        y = y[0]
        X_po = X[np.where(y == 1)]
        X_ne = X[np.where(y == 0)]
        ax = plt.axes(projection='3d')
        # Each coordinate is a COLUMN of the sample matrix. The previous
        # row indexing (X_po[0, :]) plotted the transposed data.
        x_1 = X_po[:, 0]
        y_1 = X_po[:, 1]
        z_1 = X_po[:, 2]
        x_2 = X_ne[:, 0]
        y_2 = X_ne[:, 1]
        z_2 = X_ne[:, 2]
        # (1) Scatter plot of both classes.
        ax.scatter(x_1, y_1, z_1, c="r", label="正实例")
        ax.scatter(x_2, y_2, z_2, c="b", label="负实例")
        ax.legend(loc='best')
        # (2) Separating plane, solved for z.
        x = np.linspace(-3, 3, 3)
        y = np.linspace(-3, 3, 3)
        x_3, y_3 = np.meshgrid(x, y)
        a, b, c, d = self.w[0]
        z_3 = -(a * x_3 + b * y_3 + d) / c
        ax.plot_surface(x_3, y_3, z_3, alpha=1)  # alpha: transparency
        plt.show()


# Training data set
X_train = np.array([[3, 3, 3], [4, 3, 2], [2, 1, 2], [1, 1, 1], [-1, 0, 1],[2, -2, 1]])
y_train = np.array([[1, 1, 1, 0, 0, 0]])
# 构建实例，进行训练
clf = LogisticRegression()
clf.fit(X_train, y_train)
# print(clf.score(X_train, y_train))#0.9933665662298423
# clf.draw(X_train, y_train)

7.支持向量机

概论

推导与实现：SMO算法

8.提升方法

概论

【统计学习方法系列】赏析李航教授《监督学习：回归与分类》相关推荐

向毕业妥协系列之机器学习笔记:监督学习-回归与分类（一）
目录一.Machine Learning概念二.单(双)变量线性回归模型 1.代价函数 2.梯度下降三.搭建实验环境一.Machine Learning概念简单在知乎上搜了一下机器学习的学习 ...
最新发布!《统计学习方法》第二版无监督学习视频课上线了！
说起机器学习最响当当的书籍,李航博士的<统计学习方法>一定是榜上有名了!许多互联网企业的面试.笔试题目,都参考这本书. 这本书第一版就包含了众多主要的监督学习算法与模型,通过作者 6 年时 ...
统计学习方法笔记（李航）———第六章（逻辑斯谛回归）
前言: <统计学习方法>虽然分别对两者作了介绍,但没有深入讨论它们之间的联系.本文准备从最大熵模型出发,推导出逻辑斯谛回归模型,并解释两者的联系与区别. 本文主要从以下几个方面进行描述: ...
统计学习方法笔记（李航）———第五章（决策树）
一.特征选择特征选择的目的是筛选最有价值的特征,提高决策树的学习效率,同时避免无价值的特征导致模型过于复杂.这是决策树模型的理论基础,涉及信息论的一些基本内容,书中一笔带过.为了加深理解,我把书中的 ...
统计学习方法笔记（李航）———第三章（k近邻法）
k 近邻法 (k-NN) 是一种基于实例的学习方法,无法转化为对参数空间的搜索问题(参数最优化问题).它的特点是对特征空间进行搜索.除了k近邻法,本章还对以下几个问题进行较深入的讨论: 切比雪夫距 ...
统计学习方法笔记（李航）———第四章（朴素贝叶斯法）
推荐阅读:小白之通俗易懂的贝叶斯定理(Bayes' Theorem) 朴素贝叶斯法是一种多分类算法,它的基础是"朴素贝叶斯假设"(假设实例的各个特征具有条件独立性).根据训练集估计 ...
统计学习方法笔记（李航）———第二章（感知机）
感知机(perceptron)是Rosenblatt在60年代提出的第一个机器学习模型.尽管比较简单,而且有局限性,但它是后续学习"支持向量机"的基础.本章有两个值得注意的地方:一 ...
李航-统计学习方法-笔记-1：概论
写在前面本系列笔记主要记录<统计学习方法>中7种常用的机器学习分类算法,包括感知机,KNN,朴素贝叶斯,决策树,逻辑斯谛回归与最大熵模型,SVM,boosting. 课本还涉及到3种算法 ...
重磅开源！所有的李航老师《统计学习方法》算法代码实现！！！
关注上方"深度学习技术前沿",选择"星标公众号", 资源干货,第一时间送达! 超有料的仓库项目资源---<统计学习方法>代码李航老师的<统计 ...

【统计学习方法系列】赏析李航教授《监督学习：回归与分类》

1.统计学习方法概论

概论

回归

最小二乘法拟合系数

正则化

Code

分类

2.感知机

概念

Code

3.K近邻法（推荐：例子->代码->概念）

概念

①

②

③

Code

4.朴素贝叶斯

概论

5.决策树

概论

6.逻辑斯谛回归与最大熵（分类问题）

概念

优化方法举例：梯度下降法

代码

7.支持向量机

概论

8.提升方法

概论

【统计学习方法系列】赏析李航教授《监督学习：回归与分类》相关推荐

最新文章

热门文章