线性回归

单变量线性回归

准备工作

# 导入需要使用的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt# 导入数据集
# The file has no header row, so the two column names are supplied here.
path =  'ex1data1.txt'
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
data.head()  # first rows of the data (five by default)
data.describe() # descriptive statistics for every column

数据展示：

	Population	Profit
0	6.1101	17.5920
1	5.5277	9.1302
2	8.5186	13.6620
3	7.0032	11.8540
4	5.8598	6.8233

数据统计：

	Population	Profit
count	97.000000	97.000000
mean	8.159800	5.839135
std	3.869884	5.510262
min	5.026900	-2.680700
25%	5.707700	1.986900
50%	6.589400	4.562300
75%	8.578100	7.046700
max	22.203000	24.147000

数据可视化，绘制散点图：

# kind='scatter' selects a scatter plot, x / y name the columns for each axis,
# and figsize sets the size of the figure.
data.plot(kind='scatter', x='Population', y='Profit', figsize=(12,8))
plt.show()

# Insert a column named 'Ones' filled with 1 at position 0, i.e. x0 is always 1
data.insert(0, 'Ones', 1)
# Split the dataset into X and y
cols = data.shape[1]
X = data.iloc[:,0:cols-1]  # X is data without its last column
y = data.iloc[:,cols-1:cols]  # y is the last column of data
# Check that X (training inputs) and y (target variable) look right
X.head()
y.head()
# Convert to numpy matrices
X = np.matrix(X.values)
y = np.matrix(y.values)
# Initialise theta as a zero row vector
theta = np.zeros(shape=(1,X.shape[1]))
# Show theta (array([[0., 0.]]))
theta
# Check the shapes ((97, 2), (1, 2), (97, 1))
X.shape, theta.shape, y.shape
# Compute the cost (32.07273387745567)
computeCost(X, y, theta)

其中代价函数为：

def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) = sum((X @ theta.T - y)**2) / (2 * m).

    Args:
        X: design matrix with one row per sample, shape (m, n).
        y: target values, shape (m, 1); a flat length-m vector is accepted too.
        theta: parameters, shape (1, n); a flat or column vector is accepted too.

    Returns:
        The scalar cost as a float.
    """
    # Work on plain 2-D arrays so np.matrix and np.ndarray inputs behave the
    # same, and so a flat y cannot silently broadcast into an (m, m) residual.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    theta = np.asarray(theta, dtype=float).reshape(1, -1)
    dif = X @ theta.T - y
    cost = (dif.T @ dif)[0, 0] / (2 * len(X))
    return float(cost)

梯度下降

梯度下降函数：

def gradientDescent(X, y, theta, alpha, iters):
    """Run batch gradient descent for linear regression.

    Args:
        X: design matrix, shape (m, n).
        y: targets, shape (m, 1).
        theta: initial parameters, shape (1, n).
        alpha: learning rate.
        iters: number of iterations to run.

    Returns:
        (theta, cost): the final parameters as a (1, n) np.matrix, and an
        array of length iters holding the cost after each update (kept for
        plotting the learning curve).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    # Cast to float so an integer initial theta is not truncated on update.
    theta = np.asarray(theta, dtype=float).reshape(1, -1)
    m = len(X)
    cost = np.zeros(iters)
    for i in range(iters):
        # Prediction error for the current theta
        error = X @ theta.T - y
        # Every component is updated at once from the same error vector.
        theta = theta - alpha * ((error.T @ X) / m)
        cost[i] = computeCost(X, y, theta)
    return np.matrix(theta), cost

# Hyper-parameters
alpha = 0.01
iters = 1000
# Find theta with gradient descent
g, cost = gradientDescent(X, y, theta, alpha, iters)
# Resulting theta: matrix([[-3.24140214,  1.1272942 ]])
g
# Cost at that theta (4.515955503078913)
computeCost(X, y, g)
# Plot the linear model together with the data to inspect the fit
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = g[0, 0] + (g[0, 1] * x)
fig, ax = plt.subplots(figsize=(12,8))
# Regression line in red
ax.plot(x, f, 'r', label='Prediction')
# Raw data points (legend label typo 'Traning' corrected)
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()

迭代次数和损失的关系（左边看着像竖直的线，但其实是x轴从0到1的）：

# Plot the cost recorded at each iteration against the iteration index
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

多变量线性回归

# A house-price dataset with 2 variables (house size, number of bedrooms) and a target (house price)
path =  'ex1data2.txt'
data2 = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
data2.head()
# Pre-processing: rescale every column to (value - mean) / std
data2 = (data2 - data2.mean()) / data2.std()
data2.head()
# Add the x0 column of ones
data2.insert(0, 'Ones', 1)
# Split the dataset
cols = data2.shape[1]
X2 = data2.iloc[:,0:cols-1]
y2 = data2.iloc[:,cols-1:cols]
# Convert to matrices and initialise theta
X2 = np.matrix(X2.values)
y2 = np.matrix(y2.values)
theta2 = np.matrix(np.array([0,0,0]))
# Compute theta with gradient descent
g2, cost2 = gradientDescent(X2, y2, theta2, alpha, iters)
# Final cost (0.13070286230463776)
computeCost(X2, y2, g2)
# Cost versus iteration plot
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(np.arange(iters), cost2, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

正规方程

$X^{T}X\theta = X^{T}y \;\Rightarrow\; \theta = (X^{T}X)^{-1}X^{T}y$

def normalEqn(X, y):
    """Solve linear regression in closed form: theta = (X^T X)^-1 X^T y.

    Args:
        X: design matrix, shape (m, n).
        y: targets, shape (m, 1).

    Returns:
        theta as a column, shape (n, 1) when y has shape (m, 1).
    """
    # pinv instead of the np.matrix-only `.I`: it also accepts plain ndarrays
    # and still returns a solution when X^T X is singular.
    return np.linalg.pinv(X.T @ X) @ X.T @ y


final_theta2 = normalEqn(X, y)
final_theta2.T
print(
    "正规方程的最终参数：", final_theta2.T,
    "\n梯度下降的最终参数：", g,
    "\n正规方程的最终损失函数：", computeCost(X, y, final_theta2.T),
    "\n梯度下降的最终损失函数：", computeCost(X, y, g),
)
# Printed values (labels translated):
#   normal-equation parameters:    [[-3.89578088  1.19303364]]
#   gradient-descent parameters:   [[-3.24140214  1.1272942 ]]
#   normal-equation final cost:    4.476971375975179
#   gradient-descent final cost:   4.515955503078913

逻辑回归

准备数据

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight') #样式美化
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report  # builds the evaluation report

# Load the exam-score dataset. This assignment used to sit inside the comment
# on the import line above, so `data` was never actually reloaded.
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()  # first five rows
data.describe()
# Plot style; markers=['o', '+'] draws the two classes as circles and plus signs
sns.set(context="notebook", style="white", palette="deep")
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data, height=6, fit_reg=False, scatter_kws={"s": 50},markers = ['o','+'])
plt.show()

def get_X(df):
    """Return the feature matrix of df with a leading column of ones (x0).

    Every column of df except the last one is treated as a feature.

    Returns:
        An ndarray (not np.matrix) with len(df) rows and as many columns as df.
    """
    # Build the ones column on df's own index: concat(axis=1) aligns on index
    # labels, so a default RangeIndex would misalign with a re-indexed df.
    ones = pd.DataFrame({'ones': np.ones(len(df))}, index=df.index)
    data = pd.concat([ones, df], axis=1)  # column-wise merge
    return data.iloc[:, :-1].values  # everything except the last column


def get_y(df):
    """Return the last column of df as a 1-D ndarray."""
    return np.array(df.iloc[:, -1])


def normalize_feature(df):
    """Feature scaling: rescale every column of df to (column - mean) / std."""
    return df.apply(lambda column: (column - column.mean()) / column.std())


X = get_X(data)
print(X.shape)  # (100, 3)
# This assignment used to be hidden inside the comment on the previous line.
y = get_y(data)
print(y.shape)  # (100,)

各种函数

Sigmoid函数：

def sigmoid(z):
    """Return the logistic function g(z) = 1 / (1 + exp(-z)), element-wise.

    Args:
        z: a scalar or a numpy array.
    """
    # exp(-log(1 + exp(-z))) equals 1 / (1 + exp(-z)), but logaddexp never
    # evaluates exp() of a large positive number, so very negative z yields 0
    # without an overflow warning.
    return np.exp(-np.logaddexp(0, -z))


# Plot the sigmoid function
fig, ax = plt.subplots(figsize=(8, 6))
# Evaluate sigmoid on z from -10 up to (but excluding) 10 in steps of 0.01
ax.plot(np.arange(-10, 10, step=0.01),sigmoid(np.arange(-10, 10, step=0.01)))
ax.set_ylim((-0.1,1.1))  # y-axis limits slightly wider than [0, 1]
ax.set_xlabel('z', fontsize=18)
ax.set_ylabel('g(z)', fontsize=18)
ax.set_title('sigmoid function', fontsize=18)
plt.show()

损失函数：

# Initialise theta with one zero per column of X
theta=np.zeros(X.shape[1])
theta   # array([0., 0., 0.])
# Note: a@b is equivalent to a.dot(b)
def cost(theta, X, y):
    """Return the mean logistic-regression (cross-entropy) loss.

    Args:
        theta: parameter vector, shape (n,).
        X: design matrix, shape (m, n).
        y: labels in {0, 1}, shape (m,).
    """
    z = X @ theta.T
    # -log(sigmoid(z)) == logaddexp(0, -z) and -log(1 - sigmoid(z)) == logaddexp(0, z).
    # Evaluating these directly keeps the loss finite when the sigmoid saturates
    # to exactly 0 or 1, where log(sig) / log(1 - sig) would give inf or nan.
    costf = np.sum(y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)) / len(y)
    return costf


# With theta = 0 every sigmoid value is 0.5, so each sample contributes
# -ln(0.5) = ln(2) and the mean is ln(2) = 0.6931471805599453.
# Evaluate the loss at the current (all-zero) theta
cost(theta, X, y)

计算梯度方向：

def gradient(theta, X, y):
    """Return the gradient of the logistic loss: (sigmoid(X theta) - y)^T X / m.

    Args:
        theta: parameter vector, shape (n,).
        X: design matrix, shape (m, n).
        y: labels, shape (m,).

    Returns:
        Gradient vector with one entry per parameter, shape (n,).
    """
    sig = sigmoid(X @ theta.T)
    grad = ((sig - y).T @ X) / len(y)
    return grad


gradient(theta, X, y)
# array([ -0.1       , -12.00921659, -11.26284221])
# The loss decreases fastest when moving against this vector.
# Use scipy.optimize.minimize to find the parameters
import scipy.optimize as opt
# Minimise `cost` starting from theta; `gradient` is supplied as the Jacobian.
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)
print(res)
# Printed result:
#     fun: 0.20349770249067073
#     jac: array([-1.29015205e-05, -7.87933956e-04, -9.06046726e-04])
# message: 'Optimization terminated successfully.'
#    nfev: 72
#    nhev: 0
#     nit: 28
#    njev: 243
#  status: 0
# success: True
#       x: array([-25.15887187,   0.20621202,   0.20145168])

计算准确率：

def predict(x, theta):
    """Return 0/1 class predictions for the rows of x.

    Args:
        x: design matrix, shape (m, n).
        theta: parameter vector, shape (n,).

    Returns:
        int64 ndarray of shape (m,): 1 where sigmoid(x @ theta) >= 0.5, else 0.
    """
    # Use the argument x; the original body read the module-level X instead,
    # so the function ignored whatever data it was given.
    sig = sigmoid(x @ theta.T)
    y_pred = (sig >= 0.5).astype(np.int64)
    return y_pred


final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))

决策边界

决策边界是θ^TX=θ₀+θ₁x₁+θ₂x₂=0
x₂=-θ₀/θ₂ - θ₁x₁/θ₂

print(res.x)  # the final theta
coef = -(res.x / res.x[2])  # rescale so the coefficient of x2 becomes -1
print(coef)  # [124.88787185  -1.02363018  -1.        ]
# Boundary line x2 = coef[0] + coef[1] * x1. Dedicated names are used so the
# label vector `y` is not overwritten by the line's y-coordinates.
boundary_x = np.arange(130, step=0.1)
boundary_y = coef[0] + coef[1]*boundary_x
# Plot the data points
data.describe()  # look up the ranges of x and y
sns.set(context="notebook", style="ticks", font_scale=1.5)
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data, height=6, fit_reg=False, scatter_kws={"s": 25},markers = ['o','+'])
# Draw the decision boundary
plt.plot(boundary_x, boundary_y, 'grey')
plt.xlim(30, 100)
plt.ylim(30, 100)
plt.title('Decision Boundary')
plt.show()

正则化

# Load the data
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()
# Show the data
sns.set(context="notebook", style="ticks", font_scale=1.5)
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 50},markers = ['o','+'])
plt.title('Regularized Logistic Regression')
plt.show()

def feature_mapping(x, y, power, as_ndarray=False):
    """Expand two features into every monomial x^(i-p) * y^p with 0 <= p <= i <= power.

    Columns are named "f{i-p}{p}" and ordered by total degree i, then by p.

    Args:
        x, y: equal-length 1-D sequences of feature values.
        power: highest total degree to generate.
        as_ndarray: return the values as an ndarray instead of a DataFrame.

    Returns:
        A DataFrame (or ndarray) with (power + 1) * (power + 2) / 2 columns.
    """
    data = {
        "f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
        for i in range(power + 1)
        for p in range(i + 1)
    }
    frame = pd.DataFrame(data)
    return frame.values if as_ndarray else frame

例子：

# Map the single point (x, y) = (1, 2) up to total degree 6
a = np.array([1])
b = np.array([2])
c = feature_mapping(a,b,6)
# Read row by row, the result is [1,1,2,1,2,4,1,2,4,8,1,2,4,8,16,1,2,4,8,16,32,1,2,4,8,16,32,64]

x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)
print(data.shape)   # (118, 28)
data.head()
data.describe()
# Initialise theta, X and y
theta = np.zeros(data.shape[1])
# X holds the polynomial combinations of x1 and x2 (up to degree 6)
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)  # (118, 28)
y = get_y(df)
print(y.shape)  # (118,)

加上正则项后的损失函数：

def regularized_cost(theta, X, y, l=1):
    """Return the logistic loss plus the L2 penalty l / (2m) * sum(theta[1:] ** 2).

    Args:
        theta: parameter vector, shape (n,); theta[0] is left out of the penalty.
        X: design matrix, shape (m, n).
        y: labels, shape (m,).
        l: regularisation strength (lambda).
    """
    theta_1 = theta[1:]
    regu_cost = np.sum(theta_1 ** 2) * l / (2 * len(y))
    return cost(theta, X, y) + regu_cost


# With theta = 0 every sigmoid value is 0.5, so the unregularised loss is
# ln(2) = 0.6931471805599453; the penalty term is 0 as well, so the call
# below returns 0.6931471805599453.
# Evaluate the regularised loss at the current theta with lambda = 1
regularized_cost(theta, X, y, l=1)

正则化梯度（θ₀不参与正则化——我们不希望把θ₀压向0、迫使决策边界过原点，所以正则项只对j≥1的θⱼ生效）：

def regularized_gradient(theta, X, y, l=1):
    """Return the gradient of the regularised logistic loss.

    The penalty adds l / m * theta[j] for every j >= 1.

    Args:
        theta: parameter vector, shape (n,).
        X: design matrix, shape (m, n).
        y: labels, shape (m,).
        l: regularisation strength (lambda).
    """
    regularized_term = theta * l / len(y)
    regularized_term[0] = 0  # theta[0] is not penalised
    return gradient(theta, X, y) + regularized_term


regularized_gradient(theta, X, y)
#array([8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
#       1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
#       8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
#       1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
#       4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
#       3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
#       4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02])

使用scipy.optimize.minimize（Newton-CG方法）寻找参数：

# The alias must match the `opt.minimize` call below; it was imported as `op`,
# which only worked when an earlier cell had already defined `opt`.
import scipy.optimize as opt
print('init cost = {}'.format(regularized_cost(theta, X, y)))    # init cost = 0.6931471805599454
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res

计算准确率：

# Predict with the optimised theta and print the evaluation report
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))

比较不同lambda

通过散点图来画决策边界（决策边界是隐式的）：

def draw_boundary(power, l):
    """Fit a regularised classifier and plot its decision boundary over the data.

    Args:
        power: highest degree of the x1 / x2 polynomial combinations.
        l: regularisation strength (lambda).
    """
    density = 1000
    threshhold = 2 * 10**-3
    # Fit theta for this power / lambda
    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshhold)
    # Plot the raw data points
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100},markers = ['o','+'])
    # Overlay the boundary points
    plt.scatter(x, y, c='r', s=10,marker='.')
    plt.title('Decision boundary')
    plt.show()

使用scipy.optimize.minimize（TNC方法）找theta：

def feature_mapped_logistic_regression(power, l):
    """Fit regularised logistic regression on polynomial features of ex2data2.txt.

    Args:
        power: highest total degree passed to feature_mapping.
        l: regularisation strength (lambda).

    Returns:
        The optimised parameter vector (the `x` field of the minimize result).
    """
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = get_y(df)
    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])
    res = opt.minimize(
        fun=regularized_cost,
        x0=theta,
        args=(X, y, l),
        method='TNC',
        jac=regularized_gradient,
    )
    return res.x

用离散点表示决策边界：

def find_decision_boundary(density, power, theta, threshhold):
    """Return grid points that lie approximately on the decision boundary.

    A density x density grid over [-1, 1.5] x [-1, 1.5] is feature-mapped, and
    the points whose |features . theta| is below threshhold are kept (z = 0 is
    the boundary, so points with z near 0 stand in for it).

    Args:
        density: number of grid steps along each axis.
        power: highest total degree passed to feature_mapping.
        theta: fitted parameter vector.
        threshhold: largest |z| still counted as lying on the boundary.

    Returns:
        (x, y): the f10 and f01 columns of the kept rows, as pandas Series.
    """
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)
    # Same pair order as [(x, y) for x in t1 for y in t2], built without
    # materialising density**2 Python tuples.
    x_cord = np.repeat(t1, density)
    y_cord = np.tile(t2, density)
    mapped_cord = feature_mapping(x_cord, y_cord, power)
    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshhold]
    return decision.f10, decision.f01

lambda=1时：

# Degree-6 features, lambda = 1
draw_boundary(power=6, l=1)

lambda=0时：

# Degree-6 features, lambda = 0 (penalty term switched off)
draw_boundary(power=6, l=0)

lambda=100时：

# Degree-6 features, lambda = 100
draw_boundary(power=6, l=100)

【深度之眼吴恩达机器学习第四期】笔记（三）相关推荐

【深度之眼吴恩达机器学习第四期】笔记（十二）
目录大规模学习小批量梯度下降在线学习数据并行应用举例人工合成数据上限分析总结大规模学习现在机器学习的性能比过去的好,其中一个原因就是现在拥有大量的数据. 而且其中一种获得高性能机器 ...
【深度之眼吴恩达机器学习第四期】笔记（五）
目录机器学习诊断一.无超参数时对假设进行评估二.有超参数时对假设进行评估三.过拟合还是欠拟合四.增加还是减小正则化参数λ 五.应该获取更多的数据样本吗总结一下实现算法的推荐方法数据偏差 ...
【深度之眼吴恩达机器学习第四期】笔记（一）
目录第一章:什么是机器学习第二章:线性回归模型第三章:矩阵运算第四章:多变量线性回归正规方程第五章:操作第一章:什么是机器学习机器学习主要分为监督学习(我们教计算机如何学习)和无监督学 ...
【深度之眼吴恩达机器学习第四期】笔记（十）
目录异常点检测高斯分布异常点检测和监督学习选择特征多元高斯分布使用多元高斯分布的异常点检测算法原始模型VS.多元高斯分布的模型编程异常点检测假如有一个关于飞机引擎的数据集,而且这些 ...
【深度之眼吴恩达机器学习第四期】笔记（九）
目录 K均值 K均值算法语言描述伪代码描述解决分离不佳的簇 K均值的损失函数 K均值初始化如何选择K 主成分分析用途1:去除冗余特征用途2:可视化数据直观来理解主成分分析主成分分析与线 ...
【深度之眼吴恩达机器学习第四期】笔记（四）
目录神经网络神经网络训练流程我们已经有线性回归和逻辑回归了,为什么还要使用神经网络呢? 对于一个有两个输入分量(x1,x2)的分类问题,我们使用这两个分量的组合来构造假设函数(图中右上角),可能 ...
【深度之眼吴恩达机器学习第四期】笔记（二）
目录逻辑回归线性回归不适合分类问题逻辑回归函数决策边界损失函数梯度下降多类别分类正则化欠拟合和过拟合线性回归正则化正规方程的正则化逻辑回归正则化逻辑回归逻辑回归虽然叫回归, ...
【深度之眼吴恩达机器学习第四期】笔记（十一）
目录推荐系统基于内容的推荐系统协同过滤均值归一化编程推荐系统以电影推荐系统为例子,假设4个用户(nu=4)对5部电影(nm=5)作出了以下评分,其中"?"代表第j个用 ...
【深度之眼吴恩达机器学习第四期】笔记（八）
目录 SVM 从逻辑回归到SVM 间隔最大理解SVM 直觉上来理解SVM 核函数 SVM编程 SVM 从逻辑回归到SVM 在逻辑回归中,如果标签y=1,我们希望预测值也等于1,那么就需要θTx远远大于 ...
【深度之眼吴恩达机器学习第四期】笔记（七）
目录模型优化损失函数梯度函数正则化梯度与代价函数学习曲线选择最优的超参数lambda 模型优化准备数据: import numpy as np import scipy.io as si ...

【深度之眼吴恩达机器学习第四期】笔记（三）

目录