给定训练集spam_train.csv，要求根据每个ID的各种属性值来判断该ID对应的角色是Winner还是Loser(收入是否大于50K)，这是一个典型的二分类问题。

训练集介绍：

(1)、CSV文件，大小为4000行X59列;

(2)、读入后的4000行数据对应着4000个角色，ID编号从2到4001（文件中ID为1的首行记录被pd.read_csv当作表头，见下文df.head()的输出）;

(3)、59列数据中，第一列为角色ID，最后一列为分类结果，即label(0、1两种)，中间的57列为角色对应的57种属性值；
　　　(4)、数据集地址：https://pan.baidu.com/s/1mG7ndtlT4jWYHH9V-Rj_5g，提取码：hwzf 。

import pandas as pd
import numpy as np# 更新参数，训练模型
def _sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) without overflow."""
    # The tanh form is mathematically identical to 1 / (1 + exp(-z)) but
    # cannot overflow for large negative z the way np.exp(-z) does.
    return 0.5 * (1.0 + np.tanh(0.5 * z))


# Update the parameters, i.e. train the model
def train(x_train, y_train, epoch):
    """Fit L2-regularised logistic regression with full-batch Adagrad.

    Args:
        x_train: 2-D array of shape (num_samples, num_features).
        y_train: 1-D array of 0/1 labels, one per row of ``x_train``.
        epoch: Number of full passes over the training data.

    Returns:
        A ``(weights, bias)`` tuple; ``weights`` is a 1-D array with one
        entry per feature and ``bias`` is a scalar.

    Raises:
        ValueError: If ``x_train`` contains no samples.

    Side effects:
        Prints the training-set accuracy on epochs 0, 3, 6, ...
    """
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    num, dim = x_train.shape
    if num == 0:
        raise ValueError('x_train must contain at least one sample')

    bias = 0.0  # initial bias
    weights = np.ones(dim)  # initial weights
    learning_rate = 1  # initial learning rate
    reg_rate = 0.001  # L2 regularisation coefficient
    eps = 1e-12  # keeps the Adagrad denominator non-zero
    bg2_sum = 0.0  # running sum of squared bias gradients
    wg2_sum = np.zeros(dim)  # running sum of squared weight gradients

    for i in range(epoch):
        # Gradient of the mean cross-entropy loss over the whole training
        # set, plus the derivative of reg_rate * ||weights||^2.
        err = _sigmoid(x_train.dot(weights) + bias) - y_train
        b_g = err.sum() / num
        w_g = x_train.T.dot(err) / num + 2 * reg_rate * weights

        # Adagrad: scale each step by the root of the accumulated squared
        # gradients.
        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2

        # Without eps, an exactly-zero gradient gives 0 / 0 = NaN.
        bias -= learning_rate * b_g / (np.sqrt(bg2_sum) + eps)
        weights -= learning_rate * w_g / (np.sqrt(wg2_sum) + eps)

        # Report training accuracy every third epoch. The loss is not
        # printed: it involves log() and can become inf/NaN.
        if i % 3 == 0:
            result = (_sigmoid(x_train.dot(weights) + bias) >= 0.5).astype(float)
            acc = float(np.sum(result == y_train))
            print('after {} epochs, the acc on train data is:'.format(i), acc / num)

    return weights, bias


# Validate the model
def validate(x_val, y_val, weights, bias):
    """Return the accuracy of a trained logistic-regression model.

    Args:
        x_val: 2-D array of shape (num_samples, num_features).
        y_val: 1-D array of 0/1 labels, one per row of ``x_val``.
        weights: 1-D weight vector with one entry per feature.
        bias: Scalar bias term.

    Returns:
        The fraction of rows whose predicted label (sigmoid output >= 0.5)
        equals the true label, as a Python float.

    Raises:
        ValueError: If ``x_val`` is empty or its row count differs from the
            length of ``y_val``.
    """
    x_val = np.asarray(x_val, dtype=float)
    y_val = np.asarray(y_val, dtype=float)
    # Use the real sample count instead of a hard-coded 500.
    num = x_val.shape[0]
    if num == 0:
        raise ValueError('x_val must contain at least one sample')
    if y_val.shape[0] != num:
        raise ValueError('x_val and y_val must have the same number of rows')

    # tanh form of the sigmoid: same value as 1 / (1 + exp(-z)), no overflow.
    sig = 0.5 * (1.0 + np.tanh(0.5 * (x_val.dot(weights) + bias)))
    result = (sig >= 0.5).astype(float)
    acc = float(np.sum(result == y_val))
    return acc / num


def main():
    """Train on rows 0-3499 of spam_train.csv and print accuracy on rows 3500-3999."""
    # Read the data from the CSV file.
    # NOTE(review): the df.head() output later in this document suggests the
    # file has no header row, so read_csv presumably consumes the first
    # sample as column names -- confirm whether header=None is wanted.
    df = pd.read_csv('spam_train.csv')
    # Replace missing values with 0.
    df = df.fillna(0)
    # Expected shape (4000, 59); float dtype so in-place division works.
    array = np.array(df, dtype=float)
    # Drop the ID column and the label column: expected shape (4000, 57).
    x = array[:, 1:-1]
    # Scale the last two feature columns by their means.
    x[:, -1] /= np.mean(x[:, -1])
    x[:, -2] /= np.mean(x[:, -2])
    # Labels, expected shape (4000,).
    y = array[:, -1]

    # Split into training and validation sets.
    x_train, x_val = x[0:3500, :], x[3500:4000, :]
    y_train, y_val = y[0:3500], y[3500:4000]
    epoch = 30  # number of training epochs
    # Train.
    w, b = train(x_train, y_train, epoch)
    # Evaluate on the validation set.
    acc = validate(x_val, y_val, w, b)
    print('The acc on val data is:', acc)


if __name__ == '__main__':
    main()

after 0 epochs, the acc on train data is: 0.6134285714285714
after 3 epochs, the acc on train data is: 0.8994285714285715
after 6 epochs, the acc on train data is: 0.914
after 9 epochs, the acc on train data is: 0.9168571428571428
after 12 epochs, the acc on train data is: 0.9225714285714286
after 15 epochs, the acc on train data is: 0.9242857142857143
after 18 epochs, the acc on train data is: 0.9251428571428572
after 21 epochs, the acc on train data is: 0.9242857142857143
after 24 epochs, the acc on train data is: 0.9248571428571428
after 27 epochs, the acc on train data is: 0.9248571428571428
The acc on val data is: 0.94

import pandas as pd
import numpy as np
def _sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) without overflow."""
    # The tanh form is mathematically identical to 1 / (1 + exp(-z)) but
    # cannot overflow for large negative z the way np.exp(-z) does.
    return 0.5 * (1.0 + np.tanh(0.5 * z))


def train(x_train, y_train, epoch):
    """Fit L2-regularised logistic regression with full-batch Adagrad.

    Args:
        x_train: 2-D array of shape (num_samples, num_features).
        y_train: 1-D array of 0/1 labels, one per row of ``x_train``.
        epoch: Number of full passes over the training data.

    Returns:
        A ``(weights, bias)`` tuple; ``weights`` is a 1-D array with one
        entry per feature and ``bias`` is a scalar.

    Raises:
        ValueError: If ``x_train`` contains no samples.

    Side effects:
        Prints the training-set accuracy on epochs 0, 3, 6, ...
    """
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    num, dim = x_train.shape
    if num == 0:
        raise ValueError('x_train must contain at least one sample')

    bias = 0.0
    weights = np.ones(dim)
    learning_rate = 1
    reg_rate = 0.001  # L2 regularisation coefficient
    eps = 1e-12  # keeps the Adagrad denominator non-zero
    bg2_sum = 0.0  # running sum of squared bias gradients
    wg2_sum = np.zeros(dim)  # running sum of squared weight gradients

    for i in range(epoch):
        # Mean cross-entropy gradient over the whole training set, plus the
        # derivative of reg_rate * ||weights||^2.
        err = _sigmoid(x_train.dot(weights) + bias) - y_train
        b_g = err.sum() / num
        w_g = x_train.T.dot(err) / num + 2 * reg_rate * weights

        bg2_sum += b_g ** 2
        wg2_sum += w_g ** 2

        # Without eps, an exactly-zero gradient gives 0 / 0 = NaN.
        bias -= learning_rate * b_g / (np.sqrt(bg2_sum) + eps)
        weights -= learning_rate * w_g / (np.sqrt(wg2_sum) + eps)

        if i % 3 == 0:
            result = (_sigmoid(x_train.dot(weights) + bias) >= 0.5).astype(float)
            acc = float(np.sum(result == y_train))
            print('{}epochs'.format(i), acc / num)

    return weights, bias
def validate(x_val, y_val, weights, bias):
    """Return the accuracy of a trained logistic-regression model.

    Args:
        x_val: 2-D array of shape (num_samples, num_features).
        y_val: 1-D array of 0/1 labels, one per row of ``x_val``.
        weights: 1-D weight vector with one entry per feature.
        bias: Scalar bias term.

    Returns:
        The fraction of rows whose predicted label (sigmoid output >= 0.5)
        equals the true label, as a Python float.

    Raises:
        ValueError: If ``x_val`` is empty or its row count differs from the
            length of ``y_val``.
    """
    x_val = np.asarray(x_val, dtype=float)
    y_val = np.asarray(y_val, dtype=float)
    # Use the real sample count instead of a hard-coded 500.
    num = x_val.shape[0]
    if num == 0:
        raise ValueError('x_val must contain at least one sample')
    if y_val.shape[0] != num:
        raise ValueError('x_val and y_val must have the same number of rows')

    # tanh form of the sigmoid: same value as 1 / (1 + exp(-z)), no overflow.
    sig = 0.5 * (1.0 + np.tanh(0.5 * (x_val.dot(weights) + bias)))
    result = (sig >= 0.5).astype(float)
    # Compare with y_val; the earlier version referenced y_train, which is
    # not defined inside this function.
    acc = float(np.sum(result == y_val))
    return acc / num
def main():
    """Train on rows 0-3499 of spam_train.csv and print accuracy on rows 3500-3999."""
    # NOTE(review): the df.head() output later in this document suggests the
    # file has no header row, so read_csv presumably consumes the first
    # sample as column names -- confirm whether header=None is wanted.
    df = pd.read_csv('spam_train.csv')
    df = df.fillna(0)  # replace missing values with 0
    # float dtype so the in-place divisions below are always valid
    array = np.array(df, dtype=float)
    x = array[:, 1:-1]  # drop the ID column and the label column
    # Scale the last two feature columns by their means.
    x[:, -1] /= np.mean(x[:, -1])
    x[:, -2] /= np.mean(x[:, -2])
    y = array[:, -1]  # labels are in the last column

    # Split into training and validation sets.
    x_train, x_val = x[0:3500, :], x[3500:4000]
    y_train, y_val = y[0:3500], y[3500:4000]
    epoch = 30
    w, b = train(x_train, y_train, epoch)
    acc = validate(x_val, y_val, w, b)
    print(acc)
if __name__ == '__main__':
    main()

df = pd.read_csv('spam_train.csv')

df.head()

	1	0.1	1.13	0.37	0.3	0.4	0.5	0.6	...	0.145	0.43	0.436	0.44	1.792	55	147	0.46
0	2	0.00	0.60	0.00	0.60	0.00	0.0	0.60	...	0.143	0.047	0.191	0.143	2.041	31	196	1
1	3	0.00	0.48	0.00	0.00	0.00	0.0	0.00	...	0.000	0.000	0.450	0.000	1.138	4	41	0
2	4	0.51	0.00	0.51	0.51	1.02	0.0	0.00	...	0.142	0.000	0.071	1.212	7.025	130	281	1
3	5	0.00	0.00	0.00	0.00	0.00	0.0	0.64	...	0.116	0.000	0.232	0.000	1.551	6	45	0
4	6	0.80	0.00	0.60	0.00	0.00	0.2	0.00	...	0.000	0.000	0.060	0.000	2.533	43	228	0

5 rows × 59 columns

df.shape

(4000, 59)

df.columns

Index(['1', '0', '0.1', '1.13', '0.2', '0.37', '0.3', '0.4', '0.5', '0.6','0.7', '0.8', '0.9', '0.10', '0.11', '0.12', '0.37.1', '0.13', '0.37.2','1.13.1', '0.14', '0.37.3', '0.15', '0.16', '0.17', '0.18', '0.19','0.20', '0.21', '0.22', '0.23', '0.24', '0.25', '0.26', '0.27', '0.28','0.29', '0.30', '0.31', '0.32', '0.33', '0.34', '0.35', '0.36','0.37.4', '0.38', '0.39', '0.40', '0.41', '0.42', '0.145', '0.43','0.436', '0.44', '0.45', '1.792', '55', '147', '0.46'],dtype='object')

# Fill the missing values of the 'Age' column, first with SimpleImputer and
# then with plain pandas.
from sklearn.impute import SimpleImputer
imp_mean=SimpleImputer()# default strategy: fill with the mean
imp_median=SimpleImputer(strategy='median')# fill with the median
imp_0=SimpleImputer(strategy='constant',fill_value=0)# fill with the constant 0
# NOTE: `Age` is not defined in this snippet -- presumably the 'Age' column
# reshaped to 2-D; confirm. This also rebinds imp_mean from the imputer to
# the imputed array.
imp_mean=imp_mean.fit_transform(Age)
data.loc[:,'Age']=imp_mean## Filling with pandas/NumPy (below) is simpler
import pandas as pd
import numpy as np
data.loc[:,'Age']=data.loc[:,'Age'].fillna(data.loc[:,'Age'].median())

# Encode the label classes as integers; LabelEncoder is meant for labels only
from sklearn.preprocessing import LabelEncoder
y=data.iloc[:,-1]
le=LabelEncoder()
le=le.fit(y)
label=le.transform(y)
# Alternatively: le.fit_transform(y)
data.iloc[:,-1]=label
# Or, as a single statement:
data.iloc[:,-1]=LabelEncoder().fit_transform(data.iloc[:,-1])

# Binarize a continuous feature: values above the threshold become 1,
# all others become 0.
from sklearn.preprocessing import Binarizer
# Binarizer expects 2-D input, so reshape the 1-D column to (n_samples, 1).
# reshape(-1, -1) is invalid: only one dimension may be inferred with -1.
x = data.iloc[:, 0].values.reshape(-1, 1)
transformer = Binarizer(threshold=1).fit_transform(x)
data.iloc[:, 0] = transformer

import pandas as pd
# Load the training set used in the feature-selection examples below.
df=pd.read_csv(r'digit_set/train.csv')

D:\soft\Python\lib\importlib\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObjectreturn f(*args, **kwds)
D:\soft\Python\lib\importlib\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObjectreturn f(*args, **kwds)

df.head()

	label	...
0	1	...
1	0	...
2	1	...
3	4	...
4	0	...

5 rows × 785 columns

特征选择：方差过滤

# Column 0 is the label; the remaining columns are the features.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
from sklearn.feature_selection import VarianceThreshold
select = VarianceThreshold()  # default threshold is 0: drop zero-variance features
# NOTE(review): this fits on df, which still contains the label column;
# X is presumably what was intended. Left unchanged because the shape
# printed below depends on it -- confirm before switching to X.
X_var0 = select.fit_transform(df)  # matrix with the rejected features removed
# Equivalent one-liner: X_var0 = VarianceThreshold().fit_transform(df)
X_var0.shape

(42000, 709)

# For Bernoulli (binary) features the variance is p * (1 - p); drop a feature
# when one of its two classes covers more than 80% of the samples.
from sklearn.feature_selection import VarianceThreshold
X_bvar = VarianceThreshold(.8 * (1 - .8)).fit_transform(X)

%%timeit#统计一个cell的运行时间

# Chi-squared feature selection
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# k is the number of features to keep.
# NOTE(review): x_fsvar is not defined in this snippet -- presumably the
# feature matrix left after variance filtering; confirm.
x_fschi = SelectKBest(chi2, k=300).fit_transform(x_fsvar, y)
x_fschi.shape
cross_val_score(RFC(n_estimators=10, random_state=0), x_fschi, y, cv=5).mean()

# If the right k is unknown, a learning curve can be used to find it:
# import matplotlib.pyplot as plt
# score = []
# for i in range(390, 200, -10):
#     x_fschi = SelectKBest(chi2, k=i).fit_transform(x_fsvar, y)
#     once = cross_val_score(RFC(n_estimators=10, random_state=0), x_fschi, y, cv=5).mean()
#     score.append(once)
# plt.plot(range(390, 200, -10), score)
# plt.show()

# Embedded feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
RFC_ = RFC(n_estimators=10, random_state=0)
# threshold: features whose importance falls below this value are dropped
X_embedded = SelectFromModel(RFC_, threshold=0.005).fit_transform(X, y)

用随机森林填补缺失值


# Treat the column being filled as the prediction target: rows where it is
# present supply the training targets, rows where it is missing are the
# ones to predict.
df = X.copy()
fill = df.loc[:, to_fill]  # to_fill is the name of the column to fill
# Build the feature matrix from the remaining columns plus the original labels.
df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
Ytrain = fill[fill.notnull()]
Ytest = fill[fill.isnull()]  # only its index (the rows with missing values) is used
# NOTE(review): iloc is given index labels here, which assumes the default
# 0..n-1 RangeIndex -- confirm.
Xtrain = df.iloc[Ytrain.index, :]  # rows where the value is present: training set
Xtest = df.iloc[Ytest.index, :]  # rows where the value is missing: test set
from sklearn.ensemble import RandomForestRegressor as rfr
# NOTE: from here on the name rfr refers to the fitted model, not the class.
rfr = rfr(n_estimators=100)
rfr = rfr.fit(Xtrain, Ytrain)

# Summary statistics, including the 1st, 10th, 25th, 50th, 75th and 99th percentiles.
data.describe([0.01,0.1,0.25,0.5,0.75,0.99])

# Class imbalance: oversample the minority class with SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
# fit_resample is the current API; the old fit_sample alias has been removed
# from imbalanced-learn.
X, y = sm.fit_resample(X, y)  # returns the oversampled feature matrix and labels
n_sample_ = X.shape[0]
# value_counts() requires y to be a pandas Series.
n_1_sample = y.value_counts()[1]
n_0_sample = y.value_counts()[0]

逻辑回归实例，特征预处理相关推荐

MATLAB逻辑回归实例及代码
MATLAB逻辑回归实例及代码逻辑回归基本流程: 注:回归系数W更新公式写错了,应该是减号,错写成加号了. 训练数据(包含训练样本及对应的标签)百度云链接:https://pan.baidu.com ...
逻辑回归实例--乳腺癌肿瘤预测
文章目录 0.前言 1.导入数据 2.数据预处理 3.准备训练测试数据 4. 标准化数据 5.逻辑回归模型 6.性能分析 7.十折交叉验证 0.前言环境:Python3.6.5 编译器:jupyte ...
python (2) 逻辑回归实例
import torch import pandas as pd import numpy as np import matplotlib.pyplot as plt#数据预处理 #因为把数据集第一列 ...
sklearn机器学习（六）逻辑回归实例乳腺癌检测
本节采用逻辑回归算法完成乳腺癌的检测. 逻辑回归主要用于这种二项分类问题,采用sigmoid函数作为预测函数,当x=0时,sigmoid函数的值为0.5,之后向两边趋近,因此它得到的结果都是非黑及白的 ...
机器学习 -- 二元逻辑回归实例
二元逻辑回归可用于向量的概率预测,是一种分类算法.迭代方式可选择最小二乘法或梯度下降. 迭代之后会得到每个特征的系数. 公式(对于有N个特征的向量,其中w[i]为权): f(x) = 1/(1+ e^ ...
逻辑回归实例 java_使用MATLAB进行简单的二元逻辑回归
我正在使用MATLAB进行逻辑回归,以解决一个简单的分类问题 . 我的协变量是一个介于0和1之间的连续变量,而我的分类响应是0(不正确)或1(正确)的二进制变量 . 我正在寻找运行逻辑回归来 Buil ...
逻辑回归实例：从疝气病预测病马的死亡率
先了解大致模型建立的流程,接下来我会弄一些算法的原理及梯度上升的对比: 数据有三部分: 梯度算法公式: 附完整版代码: from numpy import * import numpy as np d ...
R语言LR逻辑回归实例
二分类实例去掉setosa类 index <- which(iris$Species == 'setosa') iris <- iris[- index,]training <-i ...
机器学习与高维信息检索 - Note 3 - 逻辑回归（Logistic Regression）及相关实例
逻辑回归 Logistic Regression 3. 逻辑回归补充: 凸性 Convexity 定义3.1 定理3.2 定理3.3 成本函数的凸性 3.1逻辑回归的替代方法 3.2 线性可分性和逻 ...

逻辑回归实例，特征预处理

特征选择：方差过滤

用随机森林填补缺失值

逻辑回归实例，特征预处理相关推荐

最新文章

热门文章

	label	...
0	1	...
1	0	...
2	1	...
3	4	...
4	0	...

	label	...
0	1	...
1	0	...
2	1	...
3	4	...
4	0	...

	label	...
0	1	...
1	0	...
2	1	...
3	4	...
4	0	...