写在前面

背景介绍:对逐十分钟的风速数据进行外推预报,即用T日的144个时序数据(6个/小时*24小时)预报T+1日144个时序数据,可以先通过EEMD等滤波方法分解成不同频率的分量分别建模。文件数据分别如下:

-文件夹
    -Scripts
        -main.py
        -LSTM.py
        -Functs_LSTM.py
    -imf1.xlsx
    -imf2.xlsx
    -imf3.xlsx
    -imf4.xlsx
    -imf5.xlsx
    -imf6.xlsx
    -res.xlsx

代码

main.py 脚本说明:
功能:针对IMF1-6 & Res 数据的时间序列利用Random forest/XGB/LSTM模型进行建模
注意:1)main.py脚本需要放在Scripts文件夹中,Scripts文件夹放在IMF1-6 & Res 数据同级目录
          2)Scripts文件夹还有LSTM.py和LSTMFunctions.py脚本,是用来调用LSTM模型用的。
          3)选择不同模型时,只需要更改main.py中89行:ModelSelected = 'random forest'   可选的是'random forest' / 'XGB' / 'LSTM'
          4)模型推荐需要调参(LSTM模型除外),CvState = True;如果不想对random forest和XGB调参,则将main.py中91行修改CvState = False
          5)脚本中针对random forest和XGB调参,写了相应关键参数的阈值调参,如果需要更改调参阈值,只需要根据需要相应修改95-99行 / 101-105行
          6)对于LSTM模型,109-115行参数不一定适合特定研究,可以自行尝试修改。

'''
脚本功能:实现风速预报的数据读取、机器学习建模(random forest、XGB)
要求:
数据存放位置顺序
-文件夹
    -Scripts
        -main.py
        -LSTM.py
        -Functs_LSTM.py
    -imf1.xlsx
    -imf2.xlsx
    -imf3.xlsx
    -imf4.xlsx
    -imf5.xlsx
    -imf6.xlsx
    -res.xlsx
'''
#==申明一些基本函数库
import numpy as np
import pandas as pd
import time
import os
import code
#==申明机器学习算法需要的函数库
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
#==申明深度学习需要的库
from LSTMFunctions import train_modelLSTM
from LSTMFunctions import test_modelLSTM
#==申明画图需要的库
import matplotlib.pyplot as plt

def calculate_evaluatematrix(real, predict):
    '''
    Compute regression error metrics between observed and predicted series.

    Parameters
    ----------
    real : np.array of observed values
    predict : np.array of predicted values, same shape as `real`

    Returns
    -------
    (mse, rmse, mae, mape) : tuple of floats. MAPE is returned as a fraction
        (0.1 == 10%) and is undefined when `real` contains zeros.
    '''
    errors = real - predict
    loss_mse = np.mean(np.power(errors, 2))
    # RMSE is just the square root of the MSE; no need to recompute the errors.
    loss_rmse = np.sqrt(loss_mse)
    loss_mae = np.mean(np.abs(errors))
    # BUG FIX: MAPE normalizes by the *observed* value, not the prediction;
    # the original divided by `predict`, which rewards models that over-predict.
    loss_mape = np.mean(np.abs(errors / real))
    return loss_mse, loss_rmse, loss_mae, loss_mape

def plots(real, predict, explain):
    '''
    Plot observed vs. predicted wind-speed series (both 1-D arrays).

    Saves two figures named after `explain`: the full series and a zoom on
    the last 49 points, then shows each figure interactively.
    '''
    def _draw(r, p, filename):
        # One figure: observed in blue, predicted in red.
        plt.figure(figsize=(15, 15))
        plt.plot(r, 'b', label='real')
        plt.plot(p, 'r', label='predict')
        plt.legend()
        plt.savefig(filename, dpi=600, bbox_inches='tight')
        plt.show()

    _draw(real, predict, './%s_figure1.png' % explain)
    _draw(real[-50:-1], predict[-50:-1], './%s_figure2.png' % explain)

def train_model(x_train, y_train, x_test, paremeters, model_name, cv_state=True):
    '''
    Fit a regression model on the training set and predict the test set.

    Parameters
    ----------
    x_train : np.array, 2-D [samples_train, features_train] — training inputs
    y_train : np.array — training targets
    x_test  : np.array, 2-D [samples_test, features_test] — test inputs
    paremeters : parameter grid(s) handed to GridSearchCV when cv_state is True
    model_name : 'random forest' or 'XGB'
    cv_state : when True, run a 10-fold GridSearchCV and refit with the best
        parameters; when False, use the hard-coded parameters below directly.

    Returns
    -------
    y_predictoftest : predictions for x_test

    Raises
    ------
    ValueError : if model_name is not 'random forest' or 'XGB'
    '''
    if model_name not in ('random forest', 'XGB'):
        # BUG FIX: the original fell through with `model` unbound, raising a
        # confusing NameError at fit time; fail fast with a clear message.
        raise ValueError("model_name must be 'random forest' or 'XGB', got %r" % (model_name,))
    if cv_state:  # cv_state=True: pick the best hyper-parameters with GridSearchCV
        if model_name == 'random forest':
            model = RandomForestRegressor(random_state=1, criterion='squared_error')
            # grid search over the supplied parameter ranges
            grid = GridSearchCV(model, paremeters, cv=10, scoring="neg_mean_squared_error", verbose=10)
            grid.fit(x_train, y_train)
            print('best_params_=', grid.best_params_)
            print('best_score_=', grid.best_score_)
            # rebuild the model with the winning parameters (refit below)
            model = RandomForestRegressor(random_state=1, criterion='squared_error',
                                          max_features=grid.best_params_['max_features'],
                                          min_samples_leaf=grid.best_params_['min_samples_leaf'],
                                          max_depth=grid.best_params_['max_depth'])
        else:  # 'XGB'
            model = xgb.XGBRegressor(random_state=1)
            # grid search over the supplied parameter ranges
            grid = GridSearchCV(model, paremeters, cv=10, scoring="neg_mean_squared_error", verbose=10)
            grid.fit(x_train, y_train)
            print('best_params_=', grid.best_params_)
            print('best_score_=', grid.best_score_)
            # rebuild the model with the winning parameters (refit below)
            model = xgb.XGBRegressor(random_state=1,
                                     eta=grid.best_params_['eta'],
                                     max_depth=grid.best_params_['max_depth'],
                                     n_estimators=grid.best_params_['n_estimators'],
                                     gamma=grid.best_params_['gamma'])
    else:  # cv_state=False: use manually chosen parameters without tuning
        if model_name == 'random forest':
            model = RandomForestRegressor(random_state=1,  criterion='squared_error', max_depth=7, max_features=31,
                                          min_samples_leaf=10)
        else:  # 'XGB'
            # model = xgb.XGBRegressor(random_state=1, learning_rate=0.1, max_depth=2, n_estimators=100)
            model = xgb.XGBRegressor(random_state=1, gamma=0.1, max_depth=3, n_estimators=100)
    regr = model.fit(x_train, y_train)
    y_predictoftest = regr.predict(x_test)
    return y_predictoftest

def run_main(ModelSelected0, CvState0, TrainPercent0, ValidPercent0, Paremeters0):
    '''
    Loop over every .xlsx component file in the parent folder, train the
    selected model, evaluate on the test split, plot and save the results.

    Parameters
    ----------
    ModelSelected0 : 'random forest' / 'XGB' / 'LSTM'
    CvState0 : whether to grid-search RF/XGB hyper-parameters
    TrainPercent0 : fraction of columns used for the training split
    ValidPercent0 : fraction used when carving an LSTM validation set out of the training set
    Paremeters0 : parameter grid (RF/XGB) or hyper-parameter dict (LSTM)
    '''
    # ===== start of the driver
    # === folder containing main.py (the Scripts folder)
    PathCurrent = os.getcwd()
    # === parent folder of Scripts, where the .xlsx data files live
    PathParent = os.path.dirname(PathCurrent)
    # === collect the data file names automatically
    FilesList = sorted(os.listdir(PathParent))
    DataList = [z for z in FilesList if z.endswith('.xlsx')]
    # === for each file: read the data and build the model
    for id1, DataListEach in enumerate(DataList):
        print('文件%s开始处理' % DataListEach)
        # === time the processing of each file
        t00 = time.time()
        # === read with pandas: 144 rows * 1095 columns; each column (one day) is used to predict the next column
        DataEach = pd.read_excel(os.path.join(PathParent, DataListEach), header=None)  #
        NumColunms = DataEach.columns.shape[0]
        DataEachXTrain = DataEach.values[:144, :int(NumColunms * TrainPercent0)].T  # transpose to samples * features
        DataEachYTrain = DataEach.values[:144, 1:int(NumColunms * TrainPercent0) + 1].T  # shifted one column after DataEachXTrain; transposed to samples * features
        DataEachXTest = DataEach.values[:144, int(NumColunms * TrainPercent0):-1].T  # transpose to samples * features
        DataEachYTest = DataEach.values[:144, int(NumColunms * TrainPercent0) + 1:].T  # shifted one column after DataEachXTest; transposed to samples * features
        if ModelSelected0 in ['random forest', 'XGB']:
            y_predictoftest = train_model(DataEachXTrain, DataEachYTrain, DataEachXTest, Paremeters0, ModelSelected0,
                                          cv_state=CvState0)
        else:
            # the LSTM needs a validation set, carved out of the training set
            # NOTE(review): with ValidPercent0=0.2 this keeps only 20% of the
            # samples for training and 80% for validation — the split looks
            # inverted relative to the "validation fraction" naming; confirm intent.
            DataEachXTrain_ = DataEachXTrain[:int(DataEachXTrain.shape[0] * ValidPercent0)]
            DataEachYTrain_ = DataEachYTrain[:int(DataEachYTrain.shape[0] * ValidPercent0)]
            DataEachXValid_ = DataEachXTrain[int(DataEachXTrain.shape[0] * ValidPercent0):]
            DataEachYValid_ = DataEachYTrain[int(DataEachYTrain.shape[0] * ValidPercent0):]
            NetFile = train_modelLSTM(DataEachXTrain_, DataEachYTrain_, DataEachXValid_, DataEachYValid_, Paremeters0)
            y_predictoftest = test_modelLSTM(DataEachXTest, NetFile, Paremeters0)
        # ==== evaluate on the test split
        loss_mse,loss_rmse,loss_mae,loss_mape = calculate_evaluatematrix(DataEachYTest, y_predictoftest)
        print('文件%s 模型评价效果:MSE: %.3f | RMSE: %.3f | MAE: %.3f | MAPE: %.3f' % (DataListEach, loss_mse,loss_rmse,loss_mae,loss_mape))
        # ==== plot real vs. predicted ====
        plots(DataEachYTest, y_predictoftest, DataListEach)
        # ==== save predictions and ground truth
        y_predictoftest__ = pd.DataFrame(y_predictoftest)
        DataEachYTest__ = pd.DataFrame(DataEachYTest)
        # write both sheets to one Excel file
        SaveFileName = 'PredictAndRealInTestData_%s' % DataListEach
        with pd.ExcelWriter(SaveFileName) as writer:
            y_predictoftest__.to_excel(writer, sheet_name='Predict', index=False)
            DataEachYTest__.to_excel(writer, sheet_name='Real', index=False)
        print('文件%s结束处理 | 耗时:%.3f 分钟' % (DataListEach, (time.time() - t00) / 60))





if __name__ == "__main__":
    t0 = time.time()
    print('------------------------------main.py脚本开始运行---------------------------------')
    #==============================Step1: 提前手动定义一些参数=========================
    #=1.手动设置模型
    ModelSelected = 'random forest'
    #=2.手动设置是否需要调参
    CvState = False#是否进行参数挑选参数 如果不需要则设置为False
    #=3.手动设置训练集和测试集比例
    TrainPercent = 0.6  # 所有样本根据train:test = 6:4 比例划分
    ValidPercent = 0 # XGB RF不需要ValidPercent 只是控制LSTM的
    #=4.CvState = True 的情况下 手动设置需要调节的参数的阈值区间  !注意!:如果要添加新参数,注意需要在train_model 52/63行中也添加相应参数
    if ModelSelected == 'random forest':
        Paremeters = [{"max_features": range(1, 32, 3),
                       "min_samples_leaf": range(1, 20, 3),
                       "max_depth": range(1, 20, 3)
                       }]
    elif ModelSelected == 'XGB':
        Paremeters = [{"eta": [0.3, 0.2, 0.1],
                       "max_depth": [3, 5, 6, 10, 20],
                       "n_estimators": [100, 200, 500],
                       'gamma': [0, 0.1, 0.2, 0.5, 1]
                       }]
    elif ModelSelected == 'LSTM':  # !注意!:LSTM模型无法遍历搜索最优参数 这边只是实现手动修改参数阈值
        #==还需要手动设置验证集比例 验证集是从训练集中抽取的
        ValidPercent = 0.2
        Paremeters = {"epoch": 200,
                       "batchsize": 16,
                       "num_hidden": 32,  #隐层神经元数
                       'num_layers': 3,  #LSTM层数
                       'inputsize': 1, #特征数
                       'outputsize': 144,  #output长度
                       'sequence': 144, #input时序长度
                       }
    code.interact(local=locals())
    # ================================Step2: 开始运行脚本=================================
    run_main(ModelSelected, CvState, TrainPercent, ValidPercent, Paremeters)
    print('------------------------------main.py脚本结束运行 | 耗时:%.3f 分钟---------------------------------' % (time.time() - t0)/60)
    code.interact(local = locals())
main.py 中调用的 Functs_LSTM.py 脚本如下(注意:main.py 中以 `from LSTMFunctions import ...` 导入,因此该文件实际需保存为 LSTMFunctions.py,或相应修改 main.py 的 import 语句):
#==申明一些基本函数库
import numpy as np
import pandas as pd
import os
import time
#==申明LSTM算法需要的函数库
import torch
import torch.utils.data as Data
from LSTM import lstm
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt

def setup_seed(seed):
    '''Seed the numpy and torch (CPU + all GPUs) RNGs and force deterministic cuDNN.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # make cuDNN pick deterministic algorithms so GPU runs are repeatable
    torch.backends.cudnn.deterministic = True

class MyDataset(Data.Dataset):
    '''Minimal map-style dataset pairing each input sample with its target.'''

    def __init__(self, x, y):
        # store references only — no copying or tensor conversion here
        self.x = x
        self.y = y

    def __len__(self):
        # dataset length follows the targets
        return len(self.y)

    def __getitem__(self, index):
        # return the (input, target) pair at `index`
        return self.x[index], self.y[index]


def train_modelLSTM(x_train, y_train, x_validation, y_validation, paremeters):
    '''
    Train the LSTM network, log per-epoch training/validation losses, and
    save the loss curves plus a model checkpoint under ./LSTM/.

    Parameters
    ----------
    x_train, y_train : np.array training inputs/targets
    x_validation, y_validation : np.array validation inputs/targets
    paremeters : dict with keys 'batchsize', 'epoch', 'num_hidden',
        'num_layers', 'inputsize', 'outputsize', 'sequence'

    Returns
    -------
    net_file : path of the saved .pth checkpoint (contains 'model_state_dict')
    '''
    # ================= unpack hyper-parameters ===================
    BATCH_SIZE = paremeters.get('batchsize')
    Epoch = paremeters.get('epoch')
    num_hidden = paremeters.get('num_hidden')
    num_layers = paremeters.get('num_layers')
    inputsize = paremeters.get('inputsize')
    outputsize = paremeters.get('outputsize')
    sequence = paremeters.get('sequence')
    lrate = 0.001  # Adam learning rate
    echo_interval = 10  # print a progress line every N batches
    t0_train = time.time()
    # =================== model setup begin =================
    #default_device = torch.device("cuda:1") if torch.cuda.is_available() else ("cpu")
    #default_device = torch.cuda.set_device(1) if torch.cuda.is_available() else ("cpu")
    default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # net = model_lstm()
    net = lstm(inputsize, num_hidden, outputsize, num_layers,sequence)
    net.to(default_device, dtype=torch.float32)

    loss_fn = nn.MSELoss(reduction='mean')
    loss_fn.to(default_device, dtype=torch.float32)
    optimizer = optim.Adam(net.parameters(), lr=lrate)
    print('Hyper Parameters are ok')
    # =================== model setup end =================

    # ===== wrap the training arrays in a Dataset / DataLoader ===============
    dealDataset_train = MyDataset(x_train, y_train)
    print('Process MyDataset_train is done')
    train_loader = Data.DataLoader(dataset=dealDataset_train, batch_size=BATCH_SIZE, shuffle=False,
                                   drop_last=True, num_workers=6, pin_memory=False)

    # ===================== training loop =====================================================
    train_step = 0
    loss_epoch = []  # mean training loss per epoch
    loss_validation_epoch = []  # validation MSE per epoch
    tt = time.time()
    for epoch in range(Epoch):
        net.train()
        train_loss = []
        batchtime_begin = time.time()
        for inputs, labels in train_loader:
            hidden_train = None  # reset the hidden state every batch; if consecutive batches were temporally contiguous, hidden_train could instead be declared outside the loop and carried across batches (see the commented net(...) call below)
            inputs = inputs.to(device=default_device, dtype=torch.float32)
            labels = labels.to(device=default_device, dtype=torch.float32)
            if train_step == 0:
                print("input:shape", inputs.shape, labels.shape)
            optimizer.zero_grad()
            # outputs, (h_0, c_0) = net(inputs.unsqueeze(-1), hidden_train)
            outputs = net(inputs.unsqueeze(-1))
            loss = loss_fn(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.detach().cpu().numpy())
            train_step += 1
            if train_step % echo_interval == 0:
                tpb = (time.time() - tt) / echo_interval
                tt = time.time()
                loss_avg = np.average(train_loss)
                print("[Epoch %5d | Batch %5d] loss : %5.5f | Time/Batch: %5.5fs" % (
                    epoch, (train_step - 1), loss_avg, tpb))
        loss_ = np.average(train_loss)
        print('epoch=', epoch, 'loss=', loss_)
        loss_epoch.append(loss_)
        batchtime_end = time.time()
        print('Batch time=', batchtime_end - batchtime_begin)

        # ================ validation after each epoch ================
        net.eval()
        with torch.no_grad():
            x_validation_torch = torch.from_numpy(x_validation).to(device=default_device, dtype=torch.float32)
            y_predictofvalidation = net(x_validation_torch.unsqueeze(-1)).detach().cpu().numpy().squeeze()
            mse_validation = calculate_mse(y_validation, y_predictofvalidation)
            loss_validation_epoch.append(mse_validation)
            print('epoch=', epoch, '验证集mse=', mse_validation)

    t1_train = time.time()
    # ==== save the loss histories (`epoch` here is the last epoch index)
    if not os.path.exists(os.path.join(os.getcwd(), 'LSTM')):
        os.makedirs(os.path.join(os.getcwd(), 'LSTM'))
    np.save(os.path.join(os.getcwd(), 'LSTM', 'loss_epoch_%d_batch_%d.npy' % (epoch, BATCH_SIZE)), np.array(loss_epoch))
    np.save(os.path.join(os.getcwd(), 'LSTM', 'loss_validation_epoch_%d_batch_%d.npy' % (epoch, BATCH_SIZE)), np.array(loss_validation_epoch))
    print('Training time=', t1_train - t0_train)
    print('begin save model')
    # ============= plot and save the loss curves ==========
    plt.figure(figsize=(15, 15))
    plt.plot(loss_epoch, 'r', label='loss_train')
    plt.plot(loss_validation_epoch, 'k', label='loss_validation_epoch')
    plt.legend()
    plt.savefig(os.path.join(os.getcwd(), 'LSTM', 'loss_train_validation_LSTM.png'))

    # ======== save the model checkpoint ===============
    net_file = os.path.join(os.getcwd(), 'LSTM', 'model_lstm_epoch_%d_batch_%d.pth' % (epoch, BATCH_SIZE))
    torch.save({
        'model_state_dict': net.state_dict(),
    }, net_file)
    print("save net to file %s" % net_file)

    return net_file


def test_modelLSTM(x_test, net_file, paremeters):
    '''
    Load a trained LSTM checkpoint and run inference on the test inputs.

    Parameters
    ----------
    x_test : np.array test inputs, consumed as [samples, sequence]
    net_file : path of the .pth checkpoint written by train_modelLSTM
    paremeters : dict with the same hyper-parameters used for training
        ('num_hidden', 'num_layers', 'inputsize', 'outputsize', 'sequence')

    Returns
    -------
    np.array of predictions with singleton dimensions squeezed out
    '''
    # =================== rebuild the network =================
    num_hidden = paremeters.get('num_hidden')
    num_layers = paremeters.get('num_layers')
    inputsize = paremeters.get('inputsize')
    outputsize = paremeters.get('outputsize')
    sequence = paremeters.get('sequence')
    #default_device = torch.device("cuda:0") if torch.cuda.is_available() else ("cpu")
    default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = lstm(inputsize, num_hidden, outputsize, num_layers, sequence)
    net.to(default_device, dtype=torch.float32)
    # =============== load the trained weights =========
    pk = torch.load(net_file, map_location=default_device)
    net.load_state_dict(pk['model_state_dict'])
    # BUG FIX: switch to eval mode before inference — otherwise dropout stays
    # active and BatchNorm uses batch statistics, making predictions noisy and
    # inconsistent with the validation pass (train_modelLSTM calls net.eval()).
    net.eval()
    with torch.no_grad():
        x_test_torch = torch.from_numpy(x_test).to(device=default_device, dtype=torch.float32)
        y_predictoftest = net(x_test_torch.unsqueeze(-1))

    return y_predictoftest.detach().cpu().numpy().squeeze()


def calculate_mse(real, predict):
    '''Return the mean squared error between observed and predicted arrays.'''
    diff = real - predict
    return np.mean(diff ** 2)


def plots(real, predict, remarks):
    '''
    Plot observed vs. predicted wind-speed series (both 1-D arrays).

    Saves two figures tagged with `remarks`: the full series and a zoom on
    the first 50 points.
    '''
    def _draw(r, p, filename):
        # One figure: observed in blue, predicted in red.
        plt.figure(figsize=(15, 15))
        plt.plot(r, 'b', label='real')
        plt.plot(p, 'r', label='predict')
        plt.legend()
        plt.savefig(filename)

    _draw(real, predict, 'test_all_%s.png' % remarks)
    _draw(real[:50], predict[:50], 'test_50_%s.png' % remarks)


if __name__ == '__main__':
    # ===== fix the random seeds for a reproducible run when executed directly =====
    setup_seed(1000)

Functs_LSTM.py 脚本中调用的LSTM.py 如下:

'''
说明 将隐层数目改成32,加入dropout=0.5
并加入bn relu

'''
#时间序列预测模型LSTM
import torch
import torch.nn as nn
import seaborn as sns  #读取seaborn的数据文件,需要ladder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler


class lstm(nn.Module):
    '''
    LSTM regressor: stacked LSTM -> BatchNorm1d -> ReLU -> Linear -> ReLU.

    Maps a [batch, sequence, input_size] series to a [batch, output_size]
    prediction by flattening every timestep's hidden state into one vector.
    '''

    def __init__(self, input_size=1, hidden_size=32, output_size=1, num_layer=2,sequence=12):
        super(lstm,self).__init__()
        # dropout=0.5 acts between stacked LSTM layers (needs num_layer >= 2)
        self.layer1 = nn.LSTM(input_size,hidden_size,num_layer,batch_first=True,dropout=0.5)
        self.relu = nn.ReLU(inplace=True)
        # BatchNorm1d expects [batch, channels, length], hence the permute in forward
        self.bn1d=nn.BatchNorm1d(hidden_size)
        # one linear layer over all timesteps' hidden states, flattened
        self.layer2 = nn.Linear(hidden_size*sequence, output_size)

    def forward(self,x):
        # x: [batch, sequence, input_size] because batch_first=True
        out, _ = self.layer1(x)
        batch, seq_len, hidden = out.size()
        # permute to [batch, hidden, sequence] for BatchNorm1d
        normed = self.bn1d(out.permute(0, 2, 1))
        flat = self.relu(normed.reshape(batch, -1))
        # final ReLU clamps predictions to be non-negative
        return self.relu(self.layer2(flat))

if __name__ == '__main__':
    # Smoke test: batch of 32, sequence length 24, 33 features per step.
    net = lstm(33, 128, 1, 2,24)
    sample = torch.randn(32, 24, 33)  # (batch, sequence, features) — layer1 is batch_first=True
    print(net(sample).shape)
    #print(y_pred)

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐