Python in Practice: Time Series Forecasting with Random Forest, XGBoost & LSTM
Preface
Background: the task is to extrapolate ten-minute wind speed data, i.e. use day T's 144 time series values (6 per hour × 24 hours) to forecast day T+1's 144 values. The raw series can first be decomposed into components of different frequencies with a filtering method such as EEMD and each component modeled separately (a hedged decomposition sketch follows the file listing below). The data files are organized as follows:
-folder
    -Scripts
        -main.py
        -LSTM.py
        -LSTMFunctions.py
    -imf1.xlsx
    -imf2.xlsx
    -imf3.xlsx
    -imf4.xlsx
    -imf5.xlsx
    -imf6.xlsx
    -res.xlsx
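For completeness, here is a minimal sketch of how the imf1-6 and res files could be produced. This step is not part of the original post, so treat it as illustrative: it assumes the third-party PyEMD package (pip install EMD-signal), a hypothetical wind_speed.npy input array, and that the last component returned by eemd() can be treated as the residual (this may depend on the PyEMD version).

# Hedged sketch: decompose a wind speed series with EEMD and save each
# component as one xlsx file (imf1.xlsx ... res.xlsx), matching the layout
# main.py expects (144 rows x N day-columns). Assumptions: PyEMD installed,
# wind_speed.npy is a hypothetical 1-D array of length days * 144.
import numpy as np
import pandas as pd
from PyEMD import EEMD

signal = np.load('wind_speed.npy')      # 1-D array, length = days * 144
eemd = EEMD()
imfs = eemd.eemd(signal)                # shape: [n_components, len(signal)]

# Assumption: last row is the residual; the rest are IMFs, high to low frequency.
for i, comp in enumerate(imfs):
    name = 'res.xlsx' if i == len(imfs) - 1 else 'imf%d.xlsx' % (i + 1)
    days = comp.reshape(-1, 144).T      # 144 rows x N days, one column per day
    pd.DataFrame(days).to_excel(name, header=False, index=False)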
Code
main.py script description:
Function: model the IMF1-6 & Res time series with a Random forest / XGB / LSTM model.
Notes: 1) main.py must live in the Scripts folder, and Scripts must sit in the same directory as the IMF1-6 & Res data files.
2) The Scripts folder also contains the LSTM.py and LSTMFunctions.py scripts, which drive the LSTM model.
3) To switch models, only line 89 of main.py needs to change: ModelSelected = 'random forest'; valid options are 'random forest' / 'XGB' / 'LSTM'.
4) Hyperparameter tuning is recommended (except for the LSTM model): CvState = True. If you do not want to tune random forest and XGB, set CvState = False on line 91 of main.py.
5) For random forest and XGB, the script already defines search ranges for the key hyperparameters; to change them, edit lines 95-99 / 101-105 as needed.
6) For the LSTM model, the parameters on lines 109-115 are not guaranteed to suit a particular study; adjust them as you see fit.
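To make the data layout concrete before the full script: each xlsx file holds a 144 × N matrix (one column per day), and the model learns to map day T's column to day T+1's column. A minimal sketch of that shifted-column framing, using synthetic numbers in place of a real file (it mirrors the slicing done in run_main below):

# Hedged sketch of the day-T -> day-T+1 framing used in run_main.
# Synthetic stand-in for one imf*.xlsx file: 144 rows (time steps) x 10 days.
import numpy as np

data = np.random.rand(144, 10)
train_percent = 0.6
n_cols = data.shape[1]

# x uses columns 0..k-1, y uses columns 1..k: each sample is one whole day,
# transposed to [samples, features] as scikit-learn expects.
x_train = data[:, :int(n_cols * train_percent)].T            # [6, 144]
y_train = data[:, 1:int(n_cols * train_percent) + 1].T       # [6, 144]
x_test  = data[:, int(n_cols * train_percent):-1].T          # [3, 144]
y_test  = data[:, int(n_cols * train_percent) + 1:].T        # [3, 144]
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

The full main.py follows.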
'''
Script function: read the wind speed forecast data and build machine learning
models (random forest, XGB) or an LSTM model for it.
Required data layout:
-folder
    -Scripts
        -main.py
        -LSTM.py
        -LSTMFunctions.py
    -imf1.xlsx
    -imf2.xlsx
    -imf3.xlsx
    -imf4.xlsx
    -imf5.xlsx
    -imf6.xlsx
    -res.xlsx
'''
#== Basic libraries
import numpy as np
import pandas as pd
import time
import os
import code
#== Machine learning libraries
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
#== Deep learning helpers (LSTM)
from LSTMFunctions import train_modelLSTM
from LSTMFunctions import test_modelLSTM
#== Plotting
import matplotlib.pyplot as plt
def calculate_evaluatematrix(real, predict):
    loss_mse = np.mean(np.power(real - predict, 2))
    loss_rmse = np.sqrt(loss_mse)
    loss_mae = np.mean(np.abs(real - predict))
    # MAPE divides by the observed values (the original divided by the
    # predictions); beware of near-zero observations in the IMF components
    loss_mape = np.mean(np.abs((real - predict) / real))
    return loss_mse, loss_rmse, loss_mae, loss_mape
def plots(real, predict, explain):
    # ==== real and predict are wind speed series
    plt.figure(figsize=(15, 15))
    plt.plot(real, 'b', label='real')
    plt.plot(predict, 'r', label='predict')
    plt.legend()
    plt.savefig('./%s_figure1.png' % explain, dpi=600, bbox_inches='tight')
    plt.show()
    plt.figure(figsize=(15, 15))
    plt.plot(real[-50:-1], 'b', label='real')
    plt.plot(predict[-50:-1], 'r', label='predict')
    plt.legend()
    plt.savefig('./%s_figure2.png' % explain, dpi=600, bbox_inches='tight')
    plt.show()
def train_model(x_train, y_train, x_test, paremeters, model_name, cv_state=True):
    '''
    Parameters
    ----------
    x_train : training x, 2-D np.array, [samples_train, features_train]
    y_train : training y, np.array, [samples_train] (here 2-D: one column per output step)
    x_test : test x, 2-D np.array, [samples_test, features_test]
    model_name : which model to use: 'random forest' / 'XGB'
    paremeters : hyperparameter grid to search
    cv_state : whether to run the grid search
    Returns
    -------
    y_predictoftest : predictions for the test set
    '''
    if cv_state:  # cv_state=True: use GridSearchCV to pick the best hyperparameters
        if model_name == 'random forest':
            model = RandomForestRegressor(random_state=1, criterion='squared_error')
            # hyperparameter search
            grid = GridSearchCV(model, paremeters, cv=10, scoring="neg_mean_squared_error", verbose=10)
            grid.fit(x_train, y_train)
            print('best_params_=', grid.best_params_)
            print('best_score_=', grid.best_score_)
            model = RandomForestRegressor(random_state=1, criterion='squared_error',
                                          max_features=grid.best_params_['max_features'],
                                          min_samples_leaf=grid.best_params_['min_samples_leaf'],
                                          max_depth=grid.best_params_['max_depth'])
        elif model_name == 'XGB':
            model = xgb.XGBRegressor(random_state=1)
            # hyperparameter search
            grid = GridSearchCV(model, paremeters, cv=10, scoring="neg_mean_squared_error", verbose=10)
            grid.fit(x_train, y_train)
            print('best_params_=', grid.best_params_)
            print('best_score_=', grid.best_score_)
            model = xgb.XGBRegressor(random_state=1,
                                     eta=grid.best_params_['eta'],
                                     max_depth=grid.best_params_['max_depth'],
                                     n_estimators=grid.best_params_['n_estimators'],
                                     gamma=grid.best_params_['gamma'])
    else:  # cv_state=False: set the hyperparameters by hand and fit directly
        if model_name == 'random forest':
            model = RandomForestRegressor(random_state=1, criterion='squared_error', max_depth=7,
                                          max_features=31, min_samples_leaf=10)
        elif model_name == 'XGB':
            # model = xgb.XGBRegressor(random_state=1, learning_rate=0.1, max_depth=2, n_estimators=100)
            model = xgb.XGBRegressor(random_state=1, gamma=0.1, max_depth=3, n_estimators=100)
    regr = model.fit(x_train, y_train)
    y_predictoftest = regr.predict(x_test)
    return y_predictoftest
def run_main(ModelSelected0, CvState0, TrainPercent0, ValidPercent0, Paremeters0):
    # ===== main driver
    # === folder containing main.py
    PathCurrent = os.getcwd()
    # === its parent folder, where the data files live
    PathParent = os.path.dirname(PathCurrent)
    # === collect the data file names automatically
    FilesList = sorted(os.listdir(PathParent))
    DataList = [z for z in FilesList if z.endswith('.xlsx')]
    # === loop over the files: read the data, build the model
    for id1, DataListEach in enumerate(DataList):
        print('Processing file %s' % DataListEach)
        # === time each file
        t00 = time.time()
        # === read with pandas: 144 rows x 1095 columns; each column (day) predicts the next column
        DataEach = pd.read_excel(os.path.join(PathParent, DataListEach), header=None)
        NumColunms = DataEach.columns.shape[0]
        # transpose to [samples, features]; y is x shifted forward by one column (one day)
        DataEachXTrain = DataEach.values[:144, :int(NumColunms * TrainPercent0)].T
        DataEachYTrain = DataEach.values[:144, 1:int(NumColunms * TrainPercent0) + 1].T
        DataEachXTest = DataEach.values[:144, int(NumColunms * TrainPercent0):-1].T
        DataEachYTest = DataEach.values[:144, int(NumColunms * TrainPercent0) + 1:].T
        if ModelSelected0 in ['random forest', 'XGB']:
            y_predictoftest = train_model(DataEachXTrain, DataEachYTrain, DataEachXTest, Paremeters0,
                                          ModelSelected0, cv_state=CvState0)
        else:
            # The LSTM needs a validation set, carved out of the training set.
            # ValidPercent0 is the fraction reserved for validation, so train on
            # the first (1 - ValidPercent0) of the samples (the original sliced
            # this the other way round, leaving only 20% for training).
            SplitIdx = int(DataEachXTrain.shape[0] * (1 - ValidPercent0))
            DataEachXTrain_ = DataEachXTrain[:SplitIdx]
            DataEachYTrain_ = DataEachYTrain[:SplitIdx]
            DataEachXValid_ = DataEachXTrain[SplitIdx:]
            DataEachYValid_ = DataEachYTrain[SplitIdx:]
            NetFile = train_modelLSTM(DataEachXTrain_, DataEachYTrain_, DataEachXValid_, DataEachYValid_, Paremeters0)
            y_predictoftest = test_modelLSTM(DataEachXTest, NetFile, Paremeters0)
        # ==== evaluate
        loss_mse, loss_rmse, loss_mae, loss_mape = calculate_evaluatematrix(DataEachYTest, y_predictoftest)
        print('File %s model scores: MSE: %.3f | RMSE: %.3f | MAE: %.3f | MAPE: %.3f'
              % (DataListEach, loss_mse, loss_rmse, loss_mae, loss_mape))
        # ==== plots ====
        plots(DataEachYTest, y_predictoftest, DataListEach)
        # ==== save predictions and truth to an Excel file
        y_predictoftest__ = pd.DataFrame(y_predictoftest)
        DataEachYTest__ = pd.DataFrame(DataEachYTest)
        SaveFileName = 'PredictAndRealInTestData_%s' % DataListEach
        with pd.ExcelWriter(SaveFileName) as writer:
            y_predictoftest__.to_excel(writer, sheet_name='Predict', index=False)
            DataEachYTest__.to_excel(writer, sheet_name='Real', index=False)
        print('Finished file %s | elapsed: %.3f minutes' % (DataListEach, (time.time() - t00) / 60))
if __name__ == "__main__":
t0 = time.time()
print('------------------------------main.py脚本开始运行---------------------------------')
#==============================Step1: 提前手动定义一些参数=========================
#=1.手动设置模型
ModelSelected = 'random forest'
#=2.手动设置是否需要调参
CvState = False#是否进行参数挑选参数 如果不需要则设置为False
#=3.手动设置训练集和测试集比例
TrainPercent = 0.6 # 所有样本根据train:test = 6:4 比例划分
ValidPercent = 0 # XGB RF不需要ValidPercent 只是控制LSTM的
#=4.CvState = True 的情况下 手动设置需要调节的参数的阈值区间 !注意!:如果要添加新参数,注意需要在train_model 52/63行中也添加相应参数
if ModelSelected == 'random forest':
Paremeters = [{"max_features": range(1, 32, 3),
"min_samples_leaf": range(1, 20, 3),
"max_depth": range(1, 20, 3)
}]
elif ModelSelected == 'XGB':
Paremeters = [{"eta": [0.3, 0.2, 0.1],
"max_depth": [3, 5, 6, 10, 20],
"n_estimators": [100, 200, 500],
'gamma': [0, 0.1, 0.2, 0.5, 1]
}]
elif ModelSelected == 'LSTM': # !注意!:LSTM模型无法遍历搜索最优参数 这边只是实现手动修改参数阈值
#==还需要手动设置验证集比例 验证集是从训练集中抽取的
ValidPercent = 0.2
Paremeters = {"epoch": 200,
"batchsize": 16,
"num_hidden": 32, #隐层神经元数
'num_layers': 3, #LSTM层数
'inputsize': 1, #特征数
'outputsize': 144, #output长度
'sequence': 144, #input时序长度
}
code.interact(local=locals())
# ================================Step2: 开始运行脚本=================================
run_main(ModelSelected, CvState, TrainPercent, ValidPercent, Paremeters)
print('------------------------------main.py脚本结束运行 | 耗时:%.3f 分钟---------------------------------' % (time.time() - t0)/60)
code.interact(local = locals())
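Before wiring everything together, the random forest path can be smoke-tested in isolation. A minimal standalone sketch with synthetic data (no project imports needed); it mirrors what main.py does when ModelSelected = 'random forest' and CvState = False, and the random-walk series below is a stand-in, not real wind data:

# Hedged smoke test: shifted-column framing + multi-output random forest.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(1)
data = np.cumsum(rng.normal(size=(144, 100)), axis=0)  # fake 144 x 100-day series

x_train, y_train = data[:, :60].T, data[:, 1:61].T     # 60 training days
x_test,  y_test  = data[:, 60:-1].T, data[:, 61:].T    # 39 test days

model = RandomForestRegressor(random_state=1, criterion='squared_error',
                              max_depth=7, max_features=31, min_samples_leaf=10)
model.fit(x_train, y_train)        # multi-output: y has 144 target columns
pred = model.predict(x_test)
print('RMSE: %.3f' % np.sqrt(np.mean((y_test - pred) ** 2)))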
The LSTMFunctions.py script called by main.py is as follows:
#== Basic libraries
import numpy as np
import pandas as pd
import os
import time
#== Libraries for the LSTM
import torch
import torch.utils.data as Data
from LSTM import lstm
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
def setup_seed(seed):
    '''Fix the random seeds for reproducibility'''
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, index):
        xx = self.x[index]
        yy = self.y[index]
        return xx, yy
def train_modelLSTM(x_train, y_train, x_validation, y_validation, paremeters):
    # ================= unpack the parameters ===================
    BATCH_SIZE = paremeters.get('batchsize')
    Epoch = paremeters.get('epoch')
    num_hidden = paremeters.get('num_hidden')
    num_layers = paremeters.get('num_layers')
    inputsize = paremeters.get('inputsize')
    outputsize = paremeters.get('outputsize')
    sequence = paremeters.get('sequence')
    lrate = 0.001
    echo_interval = 10
    t0_train = time.time()
    # =================== model setup begin =================
    # default_device = torch.device("cuda:1") if torch.cuda.is_available() else ("cpu")
    default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = lstm(inputsize, num_hidden, outputsize, num_layers, sequence)
    net.to(default_device, dtype=torch.float32)
    loss_fn = nn.MSELoss(reduction='mean')
    loss_fn.to(default_device, dtype=torch.float32)
    optimizer = optim.Adam(net.parameters(), lr=lrate)
    print('Hyper Parameters are ok')
    # =================== model setup end =================
    # ===== build the dataset and loader ===============
    dealDataset_train = MyDataset(x_train, y_train)
    print('Process MyDataset_train is done')
    train_loader = Data.DataLoader(dataset=dealDataset_train, batch_size=BATCH_SIZE, shuffle=False,
                                   drop_last=True, num_workers=6, pin_memory=False)
    # ===================== training loop =====================
    train_step = 0
    loss_epoch = []
    loss_validation_epoch = []
    tt = time.time()
    for epoch in range(Epoch):
        net.train()
        train_loss = []
        batchtime_begin = time.time()
        for inputs, labels in train_loader:
            # If consecutive batches are temporally linked, create hidden_train once
            # outside this loop and pass it via the commented-out net() call below so the
            # hidden state carries over; resetting it here gives every batch a fresh state.
            hidden_train = None
            inputs = inputs.to(device=default_device, dtype=torch.float32)
            labels = labels.to(device=default_device, dtype=torch.float32)
            if train_step == 0:
                print("input:shape", inputs.shape, labels.shape)
            optimizer.zero_grad()
            # outputs, (h_0, c_0) = net(inputs.unsqueeze(-1), hidden_train)
            outputs = net(inputs.unsqueeze(-1))
            loss = loss_fn(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.detach().cpu().numpy())
            train_step += 1
            if train_step % echo_interval == 0:
                tpb = (time.time() - tt) / echo_interval
                tt = time.time()
                loss_avg = np.average(train_loss)
                print("[Epoch %5d | Batch %5d] loss : %5.5f | Time/Batch: %5.5fs" % (
                    epoch, (train_step - 1), loss_avg, tpb))
        loss_ = np.average(train_loss)
        print('epoch=', epoch, 'loss=', loss_)
        loss_epoch.append(loss_)
        batchtime_end = time.time()
        print('Batch time=', batchtime_end - batchtime_begin)
        # ================ validation ================
        net.eval()
        with torch.no_grad():
            x_validation_torch = torch.from_numpy(x_validation).to(device=default_device, dtype=torch.float32)
            y_predictofvalidation = net(x_validation_torch.unsqueeze(-1)).detach().cpu().numpy().squeeze()
            mse_validation = calculate_mse(y_validation, y_predictofvalidation)
            loss_validation_epoch.append(mse_validation)
            print('epoch=', epoch, 'validation mse=', mse_validation)
    t1_train = time.time()
    # ==== save the loss history
    if not os.path.exists(os.path.join(os.getcwd(), 'LSTM')):
        os.makedirs(os.path.join(os.getcwd(), 'LSTM'))
    np.save(os.path.join(os.getcwd(), 'LSTM', 'loss_epoch_%d_batch_%d.npy' % (epoch, BATCH_SIZE)),
            np.array(loss_epoch))
    np.save(os.path.join(os.getcwd(), 'LSTM', 'loss_validation_epoch_%d_batch_%d.npy' % (epoch, BATCH_SIZE)),
            np.array(loss_validation_epoch))
    print('Training time=', t1_train - t0_train)
    print('begin save model')
    # ============= save the loss curves ==========
    plt.figure(figsize=(15, 15))
    plt.plot(loss_epoch, 'r', label='loss_train')
    plt.plot(loss_validation_epoch, 'k', label='loss_validation_epoch')
    plt.legend()
    plt.savefig(os.path.join(os.getcwd(), 'LSTM', 'loss_train_validation_LSTM.png'))
    # ======== save the model ===============
    net_file = os.path.join(os.getcwd(), 'LSTM', 'model_lstm_epoch_%d_batch_%d.pth' % (epoch, BATCH_SIZE))
    torch.save({
        'model_state_dict': net.state_dict(),
    }, net_file)
    print("save net to file %s" % net_file)
    return net_file
def test_modelLSTM(x_test, net_file, paremeters):
    # =================== rebuild the model =================
    num_hidden = paremeters.get('num_hidden')
    num_layers = paremeters.get('num_layers')
    inputsize = paremeters.get('inputsize')
    outputsize = paremeters.get('outputsize')
    sequence = paremeters.get('sequence')
    # default_device = torch.device("cuda:0") if torch.cuda.is_available() else ("cpu")
    default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = lstm(inputsize, num_hidden, outputsize, num_layers, sequence)
    net.to(default_device, dtype=torch.float32)
    # =============== load the saved weights =========
    pk = torch.load(net_file, map_location=default_device)
    net.load_state_dict(pk['model_state_dict'])
    net.eval()  # required: the model contains dropout and batch norm
    with torch.no_grad():
        x_test_torch = torch.from_numpy(x_test).to(device=default_device, dtype=torch.float32)
        y_predictoftest = net(x_test_torch.unsqueeze(-1))
    return y_predictoftest.detach().cpu().numpy().squeeze()
def calculate_mse(real, predict):
    loss = np.mean(np.power(real - predict, 2))
    return loss
def plots(real, predict, remarks):
    # ==== real and predict are wind speed series
    plt.figure(figsize=(15, 15))
    plt.plot(real, 'b', label='real')
    plt.plot(predict, 'r', label='predict')
    plt.legend()
    plt.savefig('test_all_%s.png' % remarks)
    plt.figure(figsize=(15, 15))
    plt.plot(real[:50], 'b', label='real')
    plt.plot(predict[:50], 'r', label='predict')
    plt.legend()
    plt.savefig('test_50_%s.png' % remarks)
if __name__ == '__main__':
    # ===== fix the random seed ==============
    # note: this only runs when the script is executed directly, not when imported by main.py
    seed = 1000
    setup_seed(seed)
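As a quick sanity check, the two entry points can be exercised on synthetic data. A minimal sketch, assuming LSTM.py and LSTMFunctions.py are importable and that a couple of epochs are enough to verify the plumbing (the parameter values below are illustrative, not tuned):

# Hedged usage sketch for train_modelLSTM / test_modelLSTM with synthetic data.
import numpy as np
from LSTMFunctions import train_modelLSTM, test_modelLSTM

if __name__ == '__main__':  # guard needed because the DataLoader uses worker processes
    params = {"epoch": 2, "batchsize": 4, "num_hidden": 32, "num_layers": 3,
              "inputsize": 1, "outputsize": 144, "sequence": 144}
    x_train = np.random.rand(32, 144).astype(np.float32)  # 32 days, 144 steps each
    y_train = np.random.rand(32, 144).astype(np.float32)
    x_valid = np.random.rand(8, 144).astype(np.float32)
    y_valid = np.random.rand(8, 144).astype(np.float32)
    x_test = np.random.rand(8, 144).astype(np.float32)

    net_file = train_modelLSTM(x_train, y_train, x_valid, y_valid, params)
    pred = test_modelLSTM(x_test, net_file, params)
    print(pred.shape)  # expected: (8, 144)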
The LSTM.py script called by LSTMFunctions.py is as follows:
'''
Notes: hidden size set to 32, dropout=0.5 added between the LSTM layers,
plus BatchNorm and ReLU after the LSTM.
'''
# LSTM model for time series forecasting
import torch
import torch.nn as nn
class lstm(nn.Module):
    def __init__(self, input_size=1, hidden_size=32, output_size=1, num_layer=2, sequence=12):
        super(lstm, self).__init__()
        self.layer1 = nn.LSTM(input_size, hidden_size, num_layer, batch_first=True, dropout=0.5)
        self.relu = nn.ReLU(inplace=True)
        self.bn1d = nn.BatchNorm1d(hidden_size)  # nn.BatchNorm1d expects [batch, feature, sequence]
        self.layer2 = nn.Linear(hidden_size * sequence, output_size)
    def forward(self, x):
        x, _ = self.layer1(x)        # [batch, sequence, hidden_size]
        s, b, h = x.size()           # s=batch, b=sequence, h=hidden_size
        x_ = x.permute(0, 2, 1)      # -> [batch, hidden_size, sequence] for BatchNorm1d
        x = self.bn1d(x_)
        x = x.reshape(s, -1)         # flatten to [batch, hidden_size * sequence]
        x = self.relu(x)
        x = self.layer2(x)
        x = self.relu(x)             # wind speed is non-negative, so ReLU the output
        # x = x.view(s, b, -1)
        return x
if __name__ == '__main__':
    model = lstm(33, 128, 1, 2, 24)
    c = torch.randn(32, 24, 33)  # (batch_size, sequence, features), since batch_first=True
    y_pred = model(c)
    print(y_pred.shape)
    # print(y_pred)
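The original scripts feed the raw component values straight into the network; in practice LSTMs usually train better on scaled inputs, and the (unused) MinMaxScaler import hints at that. A minimal sketch of min-max scaling fitted on the training days only; this step is an illustrative addition, not part of the original pipeline:

# Hedged sketch: min-max scale the inputs, fit on training data only,
# and invert the scaling after prediction. Synthetic stand-in data.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

data = np.random.rand(144, 1095)                        # stand-in for one imf file
split = int(data.shape[1] * 0.6)

scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(data[:, :split].T)  # fit on training days only
test_scaled = scaler.transform(data[:, split:].T)

# ... train and predict in scaled space, then map predictions back:
pred_scaled = test_scaled                               # placeholder for model output
pred = scaler.inverse_transform(pred_scaled)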