1.原始数据预览

原始数据是北京的空气质量数据，如下图所示：

其中第二列有缺失值，但该列是根据其他列（具体是哪一列我忘了）的数值大小赋值得到的，因此第二列可以直接删掉。

import pandas as pd
import torch
import numpy as np
from torch import optim
from torch import nn
from torch.utils.data import DataLoader,Dataset
import pickle

2.数据预处理

这个数据集比较干净，只做了归一化处理。

# Preprocess the raw data: drop the unused column, min-max normalise every
# feature column and cache the result as pro_data.xlsx.
col_names = ['date', 'aqi', 'pm2', 'pm10', 'so2', 'co', 'no2', 'o3']
ori = pd.read_excel('./beijing.xlsx', header=None)
data = ori.drop(columns=2)
data.columns = col_names
# Take an explicit copy so the column assignments below never write through a
# slice of `data` (this is what triggers pandas' SettingWithCopyWarning).
pro = data.iloc[:, 1:].copy()
for name in col_names[1:]:
    # Min-max normalisation of each feature column.
    col_min, col_max = pro[name].min(), pro[name].max()
    pro[name] = (pro[name] - col_min) / (col_max - col_min)
# The date is appended as the LAST column of the saved sheet.
pro['date'] = data['date']
pro.to_excel('pro_data.xlsx', index=False)
pro = pd.read_excel('./pro_data.xlsx')

经过预处理后的数据长这样：

3.设置超参数

# Hyperparameters
x_timesteps = 5  # how many past periods are used for a prediction (the number of unrolled RNN cells)
y_timesteps = 1  # how many periods to predict; normally fixed at 1, kept as a hyperparameter because Env still takes it
stride = 2  # how many periods the sampling window moves between samples
hidden_size = 20
hidden_layers = 1
# Number of features to predict: 7 to predict all seven features from the
# previous periods, or fewer (e.g. 1) to predict only selected features.
y_features = 7
if y_features < 7:
    # Keep asking until the selection has exactly y_features integer entries;
    # a wrong-length selection would otherwise only fail later in the dataset.
    while True:
        raw_choice = input('输入您想要的列，若超过1列，请用,隔开（英文逗号）。您选择的列是：')
        try:
            the_col_wanted = [int(x) for x in raw_choice.split(',')]
        except ValueError:
            the_col_wanted = None
        if the_col_wanted is not None and len(the_col_wanted) == y_features:
            print('您最终选择的列是：', the_col_wanted)
            break
        print('您的选择有误,请重新进行选择')
else:
    the_col_wanted = list(range(7))
batch_size = 32
epochs = 200

4.搭建网络

# Build the network
class Net(nn.Module):
    """LSTM followed by a two-layer linear head.

    The output of the last time step of the LSTM is passed through two linear
    layers (a single one would also work) to produce one prediction step.

    Args:
        input_size: number of features per time step of the input.
        lstm_hidden_size: LSTM hidden size; defaults to the module-level
            ``hidden_size``.
        lstm_layers: number of stacked LSTM layers; defaults to the
            module-level ``hidden_layers``.
        out_features: number of predicted features; defaults to the
            module-level ``y_features``.
    """

    def __init__(self, input_size=7, lstm_hidden_size=None, lstm_layers=None,
                 out_features=None):
        super(Net, self).__init__()
        # Fall back to the module-level hyperparameters so ``Net()`` keeps
        # building exactly the same network as before.
        if lstm_hidden_size is None:
            lstm_hidden_size = hidden_size
        if lstm_layers is None:
            lstm_layers = hidden_layers
        if out_features is None:
            out_features = y_features
        # batch_first=True: inputs/outputs are (batch, time, features).
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=lstm_layers,
                            batch_first=True)
        self.linear1 = nn.Linear(in_features=lstm_hidden_size,
                                 out_features=int(lstm_hidden_size / 2))
        self.linear2 = nn.Linear(in_features=int(lstm_hidden_size / 2),
                                 out_features=out_features)

    def forward(self, x, h0=None):
        """Predict one step from a batch of input windows.

        Args:
            x: tensor of shape (batch, x_timesteps, input_size).
            h0: optional ``(h_0, c_0)`` initial state, each of shape
                (num_layers, batch, hidden_size); the batch dimension comes
                second regardless of ``batch_first``. When omitted the LSTM
                starts from zeros.

        Returns:
            Tensor of shape (batch, 1, out_features).
        """
        if h0 is None:
            out, _ = self.lstm(x)
        else:
            out, _ = self.lstm(x, h0)
        # `out` is (batch, x_timesteps, hidden_size); keep only the last time
        # step, which gives (batch, hidden_size).
        out = out[:, -1, :]
        out = self.linear1(out)
        # Re-insert the time axis. The batch size comes from the input itself
        # rather than a global, so partial batches work too.
        return self.linear2(out).unsqueeze(1)

5.自定义dataset

# Build the dataset
class Env(Dataset):
    """Sliding-window dataset built from a preprocessed Excel sheet.

    Each sample is a window of ``x_timesteps + y_timesteps`` consecutive rows:
    the first ``x_timesteps`` rows are the model input, the remaining rows
    (restricted to the module-level ``the_col_wanted`` columns) are the target.
    The samples are split in order 60/20/20 into train/val/test.

    Args:
        root: path of the Excel file; its last column is discarded.
        x_timesteps: number of input rows per sample.
        y_timesteps: number of target rows per sample.
        stride: number of rows between the starts of consecutive windows.
        mode: 'train', 'val' or 'test'; any other value keeps all samples.
    """

    def __init__(self, root, x_timesteps, y_timesteps, stride, mode):
        super(Env, self).__init__()
        self.data = pd.read_excel(root).iloc[:, :-1].values
        self.x_timesteps = x_timesteps
        self.y_timesteps = y_timesteps
        self.stride = stride
        self.mode = mode
        self.samples = self.creat_xy('./final_sample.pkl')
        self.x = self.samples[:, :-self.y_timesteps, :]
        if self.y_timesteps == 1:
            # Indexing a single row drops the time axis; restore it.
            self.y = self.samples[:, -1, the_col_wanted].reshape(len(self.x), 1, y_features)
        else:
            self.y = self.samples[:, -self.y_timesteps:, the_col_wanted]
        # x and y share their first dimension, so one slice serves both.
        part = self._split_slice(len(self.x), self.mode)
        self.x = self.x[part, :, :]
        self.y = self.y[part, :, :]

    @staticmethod
    def _split_slice(total, mode):
        """Return the slice of ``total`` samples that belongs to ``mode``."""
        train_end, val_end = int(0.6 * total), int(0.8 * total)
        parts = {
            'train': slice(0, train_end),
            'val': slice(train_end, val_end),
            'test': slice(val_end, total),
        }
        return parts.get(mode, slice(0, total))

    def creat_xy(self, save_path):
        """Cut ``self.data`` into windows, pickle them and return them.

        Every window has ``x_timesteps + y_timesteps`` rows: the leading
        ``x_timesteps`` rows are the network input, the trailing
        ``y_timesteps`` rows are the original true y.

        Args:
            save_path: file the resulting tensor is pickled to.

        Returns:
            Tensor of shape (n_samples, x_timesteps + y_timesteps, n_columns).
        """
        window = self.x_timesteps + self.y_timesteps
        # `+ 1` so the window that ends exactly on the last row is included
        # (the previous `<= len(data) - 1` bound dropped it).
        starts = range(0, len(self.data) - window + 1, self.stride)
        samples = [self.data[start:start + window, :] for start in starts]
        final_sample = torch.from_numpy(np.array(samples))
        with open(save_path, 'wb') as f:  # write the samples to a pkl file
            pickle.dump(final_sample, f)
        return final_sample

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x, y = self.x[idx, :, :], self.y[idx, :, :]
        return x, y

6.训练模型，并用val_set选出最佳模型

# Prepare the data: one Env per split, all read from the same spreadsheet.
train_db, val_db, test_db = (
    Env('./pro_data.xlsx', x_timesteps, y_timesteps, stride, split)
    for split in ('train', 'val', 'test')
)
# drop_last=True keeps every batch at exactly batch_size, matching the
# fixed-size h0/c0 created below.
train_loader = DataLoader(train_db, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_db, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_db, batch_size=batch_size, shuffle=False, drop_last=True)

# Initialise the model, the loss function and the optimiser.
model = Net()
h0 = torch.zeros([hidden_layers, batch_size, hidden_size])
c0 = torch.zeros_like(h0)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# The validation loss should keep decreasing, so start from a large value.
best_loss = 99999

# evaluate() below scores the model on a loader. It uses the loss here;
# depending on the task precision, recall, F-beta score, AUC, etc. could be
# used instead.
def evaluate(loader_name):
    """Return the mean per-batch loss of the global ``model`` on a loader.

    Args:
        loader_name: iterable yielding ``(x, y)`` batches.

    Returns:
        The average of the per-batch ``loss_fn`` values as a float, or NaN
        when the loader yields no batch.
    """
    batch_losses = []
    with torch.no_grad():
        for x, y in loader_name:
            pre_y = model(x.float(), (h0, c0))
            # Store plain floats: averaging a list of tensors relies on an
            # implicit tensor-to-ndarray conversion.
            batch_losses.append(loss_fn(pre_y, y.float()).item())
    if not batch_losses:
        return float('nan')
    # The mean of all batch losses represents the loss level on this dataset.
    return float(np.mean(batch_losses))


# Start training
# Stays None until a validation pass improves on best_loss, so the summary
# below never hits an unbound name (e.g. when epochs < 2).
best_epoch = None
for it in range(epochs):
    for batch_index, (x, y) in enumerate(train_loader):
        input_x = x.float()
        true_y = y.float()
        pre_y = model(input_x, (h0, c0))
        loss = loss_fn(pre_y, true_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (batch_index + 1) % 10 == 0:
            print('epoch：', it + 1, '   batch_index:', batch_index + 1, '  loss:', loss.item())
    # Check the validation set every two epochs and checkpoint improvements.
    if (it + 1) % 2 == 0:
        loss_for_val = evaluate(val_loader)
        if loss_for_val < best_loss:
            print('已经完 成了{}次迭代，val的loss有所下降,val_loss为：{}'.format(it + 1, loss_for_val))
            best_epoch = it + 1
            best_loss = loss_for_val
            torch.save(model.state_dict(), 'best_model_ckp.txt')
print('模型已训练完成，最好的epoch是{}，在验证集上的loss是{}'.format(best_epoch, best_loss))
# Only restore a checkpoint that this run actually wrote.
if best_epoch is not None:
    model.load_state_dict(torch.load('best_model_ckp.txt'))
print('已将参数设置成训练过程中的最优值，现在开始测试test_set')
loss_for_test = evaluate(test_loader)
print('测试集上的loss为：', loss_for_test)

epoch： 1    batch_index: 10   loss: 0.06613270193338394
epoch： 1    batch_index: 20   loss: 0.051468655467033386
epoch： 2    batch_index: 10   loss: 0.05124117061495781
epoch： 2    batch_index: 20   loss: 0.056616220623254776
已经完 成了2次迭代，val的loss有所下降,val_loss为：0.026271890848875046
epoch： 3    batch_index: 10   loss: 0.035714808851480484
epoch： 3    batch_index: 20   loss: 0.02011280320584774
epoch： 4    batch_index: 10   loss: 0.025625046342611313
epoch： 4    batch_index: 20   loss: 0.027151888236403465
已经完 成了4次迭代，val的loss有所下降,val_loss为：0.01288842223584652
epoch： 5    batch_index: 10   loss: 0.022536305710673332
... ...
epoch： 199    batch_index: 20   loss: 0.011257675476372242
epoch： 200    batch_index: 10   loss: 0.009443326853215694
epoch： 200    batch_index: 20   loss: 0.011200820095837116
模型已训练完成，最好的epoch是54，在验证集上的loss是0.006189089734107256
已将参数设置成训练过程中的最优值，现在开始测试test_set
测试集上的loss为： 0.005623153

LSTM实战:空气质量预测相关推荐

【RNN入门到实战】LSTM从入门到实战——实现空气质量预测
摘要 LSTM是一种时间递归神经网络,它出现的原因是为了解决RNN的一个致命的缺陷.RNN在处理长期依赖(时间序列上距离较远的节点)时,因为计算距离较远的节点之间的联系时会涉及雅可比矩阵的多次相乘,会 ...
数据挖掘机器学习[七]---2021研究生数学建模B题空气质量预报二次建模求解过程：基于Stacking机器学习混合模型的空气质量预测｛含码源+pdf文章｝
相关文章: 特征工程详解及实战项目[参考] 数据挖掘---汽车车交易价格预测[一](测评指标:EDA) 数据挖掘机器学习---汽车交易价格预测详细版本[二]{EDA-数据探索性分析} 数据挖掘机器学习 ...
基于ConvLSTM的伦敦空气质量预测(1) 数据处理
基于ConvLSTM的伦敦空气质量预测(1) 数据处理实验介绍该实验使用了ConvLSTM模型,对伦敦地区的空气质量进行了时序预测.数据集来源于开源库openair.实验的目标是预测Bloomsb ...
双向长短期记忆网络模型_基于深度双向长短期记忆网络的空气质量预测方法与流程...
[技术领域] 本发明涉及一种基于深度双向长短期记忆网络的空气质量预测方法,属于空气污染预测领域. 背景技术: 空气污染物浓度的预测拥有很强的学科交叉性,一直是环境.气象.数学.地理及计算机科学领域研究 ...
空气质量预测灰色预测模型模糊综合评价模型
摘要近年来,随着工业生产的发展和城市人口的迅速增长,城市大气污染日趋严重,2018年,国务院正式印发了<打赢蓝天保卫战三年行动计划>,空气质量的好坏严重影响了人民的日常生活,为此研究 ...
吃鸡排名预测挑战赛空气质量预测英雄联盟大师预测手机行为识别员工离职预测猫十二分类体验赛
1.吃鸡排名预测挑战赛 https://aistudio.baidu.com/aistudio/competition/detail/155/0/introduction 2.空气质量预测https: ...
回归预测 | MATLAB实现ELM极限学习机多输入单输出(空气质量预测)
回归预测 | MATLAB实现ELM极限学习机多输入单输出(空气质量预测) 目录回归预测 | MATLAB实现ELM极限学习机多输入单输出(空气质量预测) 效果一览基本描述程序设计参考资料效 ...
基于ConvLSTM的伦敦空气质量预测(2) 算法实施
介绍该实验使用了ConvLSTM模型,对伦敦地区的空气质量进行了时序预测.数据集来源于开源库openair.实验的目标是预测Bloomsbury的空气污染物数值.同时,也利用了Harlington, ...
利用LSTM进行空气指数预测
毕设终于结束,感谢指导老师以及团队大伙们的辛苦付出,是时候总结一下毕设的内容了. 我们团队的毕业设计是关于利用递归神经网络模型LSTM(long-short-term memory)对中国主要城市的空 ...

LSTM实战:空气质量预测

文章目录

1.原始数据预览

2.数据预处理

3.设置超参数

4.搭建网络

5.自定义dataset

6.训练模型，并用val_set选出最佳模型

LSTM实战:空气质量预测相关推荐

最新文章

热门文章